From 9b61f8ea28fc08622e2681a04bf2c849ebef3f0a Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Fri, 30 Aug 2024 13:37:37 +0800 Subject: [PATCH 01/14] ext4: hoist ext4_block_write_begin and replace the __block_write_begin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #21672 commit 6b730a405037501a260d6efd24782d2737e65d07 upstream. Using __block_write_begin() make it inconvenient to journal the user data dirty process. We can't tell the block layer maintainer, ‘Hey, we want to trace the dirty user data in ext4, can we add some special code for ext4 in __block_write_begin?’:P So use ext4_block_write_begin() instead. The two functions are basically doing the same thing except for the fscrypt related code. Remove the unnecessary #ifdef since fscrypt_inode_uses_fs_layer_crypto() returns false (and it's known at compile time) when !CONFIG_FS_ENCRYPTION. And hoist the ext4_block_write_begin so that it can be used in other files. Suggested-by: Jan Kara Suggested-by: Eric Biggers Signed-off-by: Shida Zhang Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240830053739.3588573-3-zhangshida@kylinos.cn Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/ext4.h | 2 ++ fs/ext4/inline.c | 10 +++++----- fs/ext4/inode.c | 24 +++++------------------- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 51560a95290f..e1363b64de6e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3835,6 +3835,8 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } +extern int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, + get_block_t *get_block); #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index dc6a23a94c14..ae85a7937957 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -601,10 +601,10 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(folio, from, to, - ext4_get_block_unwritten); + ret = ext4_block_write_begin(folio, from, to, + ext4_get_block_unwritten); } else - ret = __block_write_begin(folio, from, to, ext4_get_block); + ret = ext4_block_write_begin(folio, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -856,8 +856,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - ret = __block_write_begin(folio, 0, inline_size, - ext4_da_get_block_prep); + ret = ext4_block_write_begin(folio, 0, inline_size, + ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4ca451977882..336be09a8bad 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1012,10 +1012,10 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; /* - * __block_write_begin() could have dirtied some buffers. Clean + * ext4_block_write_begin() could have dirtied some buffers. Clean * the dirty bit as jbd2_journal_get_write_access() could complain * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_write_begin() isn't a real problem here as we clear + * by ext4_block_write_begin() isn't a real problem here as we clear * the bit before releasing a page lock and thus writeback cannot * ever write the buffer. */ @@ -1029,9 +1029,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, return ret; } -#ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, - get_block_t *get_block) +int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, + get_block_t *get_block) { unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; @@ -1121,7 +1120,6 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, return err; } -#endif /* * To preserve ordering, it is essential that the hole instantiation and @@ -1210,19 +1208,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); -#ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); -#else - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(folio, pos, len, - ext4_get_block_unwritten); - else - ret = __block_write_begin(folio, pos, len, ext4_get_block); -#endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, @@ -1235,7 +1225,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, folio_unlock(folio); /* - * __block_write_begin may have instantiated a few blocks + * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * @@ -2955,11 +2945,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; -#ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); -#else - ret = __block_write_begin(folio, pos, len, ext4_da_get_block_prep); -#endif if (ret < 0) { folio_unlock(folio); folio_put(folio); -- Gitee From e6dc7483e9f354063739a5a3d541ec3750ccd994 Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Fri, 30 Aug 2024 13:37:38 +0800 Subject: [PATCH 02/14] ext4: fix a potential assertion failure due to improperly dirtied buffer ANBZ: #21672 commit cb3de5fc876ee9ef2b830c9e6cdafac5c90903ef upstream. On an old kernel version(4.19, ext3, data=journal, pagesize=64k), an assertion failure will occasionally be triggered by the line below: ----------- jbd2_journal_commit_transaction { ... J_ASSERT_BH(bh, !buffer_dirty(bh)); /* * The buffer on BJ_Forget list and not jbddirty means ... } ----------- The same condition may also be applied to the lattest kernel version. When blocksize < pagesize and we truncate a file, there can be buffers in the mapping tail page beyond i_size. These buffers will be filed to transaction's BJ_Forget list by ext4_journalled_invalidatepage() during truncation. When the transaction doing truncate starts committing, we can grow the file again. This calls __block_write_begin() which allocates new blocks under these buffers in the tail page we go through the branch: if (buffer_new(bh)) { clean_bdev_bh_alias(bh); if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } ... } Hence buffers on BJ_Forget list of the committing transaction get marked dirty and this triggers the jbd2 assertion. Teach ext4_block_write_begin() to properly handle files with data journalling by avoiding dirtying them directly. Instead of folio_zero_new_buffers() we use ext4_journalled_zero_new_buffers() which takes care of handling journalling. We also don't need to mark new uptodate buffers as dirty in ext4_block_write_begin(). That will be either done either by block_commit_write() in case of success or by folio_zero_new_buffers() in case of failure. Reported-by: Baolin Liu Suggested-by: Jan Kara Signed-off-by: Shida Zhang Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240830053739.3588573-4-zhangshida@kylinos.cn Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/ext4.h | 3 ++- fs/ext4/inline.c | 7 ++++--- fs/ext4/inode.c | 42 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 40 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e1363b64de6e..f30b1440ac2e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3835,7 +3835,8 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } -extern int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, +extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, get_block_t *get_block); #endif /* __KERNEL__ */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index ae85a7937957..9168e117ba2c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -601,10 +601,11 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, goto out; if (ext4_should_dioread_nolock(inode)) { - ret = ext4_block_write_begin(folio, from, to, + ret = ext4_block_write_begin(handle, folio, from, to, ext4_get_block_unwritten); } else - ret = ext4_block_write_begin(folio, from, to, ext4_get_block); + ret = ext4_block_write_begin(handle, folio, from, to, + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -856,7 +857,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - ret = ext4_block_write_begin(folio, 0, inline_size, + ret = ext4_block_write_begin(NULL, folio, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 336be09a8bad..ac7f89304bfe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -50,6 +50,11 @@ #include +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, + struct folio *folio, + unsigned from, unsigned to); + static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { @@ -1029,7 +1034,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, return ret; } -int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, +int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, get_block_t *get_block) { unsigned int from = offset_in_folio(folio, pos); @@ -1043,6 +1049,7 @@ int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; + bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); @@ -1071,10 +1078,22 @@ int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { + /* + * We may be zeroing partial buffers or all new + * buffers in case of failure. Prepare JBD2 for + * that. + */ + if (should_journal_data) + do_journal_get_write_access(handle, + inode, bh); if (folio_test_uptodate(folio)) { - clear_buffer_new(bh); + /* + * Unlike __block_write_begin() we leave + * dirtying of new uptodate buffers to + * ->write_end() time or + * folio_zero_new_buffers(). + */ set_buffer_uptodate(bh); - mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) @@ -1104,7 +1123,11 @@ int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - folio_zero_new_buffers(folio, from, to); + if (should_journal_data) + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); + else + folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -1209,10 +1232,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, folio_wait_stable(folio); if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(folio, pos, len, + ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); + ret = ext4_block_write_begin(handle, folio, pos, len, + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, @@ -2945,7 +2969,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; - ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); + ret = ext4_block_write_begin(NULL, folio, pos, len, + ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); @@ -6279,7 +6304,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) if (folio_pos(folio) + len > size) len = size - folio_pos(folio); - err = __block_write_begin(folio, 0, len, ext4_get_block); + err = ext4_block_write_begin(handle, folio, 0, len, + ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) -- Gitee From 66afe5edf9d1151db5239347644b6a8cdbc5712f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:04 +0800 Subject: [PATCH 03/14] ext4: process folios writeback in bytes ANBZ: #21672 commit 1bfe6354e0975fe89c3d25e81b6546d205556a4b ext4. Since ext4 supports large folios, processing writebacks in pages is no longer appropriate, it can be modified to process writebacks in bytes. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250707140814.542883-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 70 +++++++++++++++++++------------------ include/trace/events/ext4.h | 13 ++++--- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ac7f89304bfe..2beee81428bc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1565,11 +1565,12 @@ struct mpage_da_data { unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ - pgoff_t first_page; /* The first page to write */ - pgoff_t next_page; /* Current page to examine */ - pgoff_t last_page; /* Last page to examine */ + loff_t start_pos; /* The start pos to write */ + loff_t next_pos; /* Current pos to examine */ + loff_t end_pos; /* Last pos to examine */ + /* - * Extent to map - this can be after first_page because that can be + * Extent to map - this can be after start_pos because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ @@ -1589,38 +1590,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - /* This is necessary when next_page == 0. */ - if (mpd->first_page >= mpd->next_page) + /* This is necessary when next_pos == 0. */ + if (mpd->start_pos >= mpd->next_pos) return; mpd->scanned_until_end = 0; - index = mpd->first_page; - end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; - start = index << (PAGE_SHIFT - inode->i_blkbits); - last = end << (PAGE_SHIFT - inode->i_blkbits); + start = EXT4_B_TO_LBLK(inode, mpd->start_pos); + last = mpd->next_pos >> inode->i_blkbits; /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); - ext4_es_remove_extent(inode, start, last - start + 1); + ext4_es_remove_extent(inode, start, last - start); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); - while (index <= end) { - nr = filemap_get_folios(mapping, &index, end, &fbatch); + index = mpd->start_pos >> PAGE_SHIFT; + end = mpd->next_pos >> PAGE_SHIFT; + while (index < end) { + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; - if (folio->index < mpd->first_page) + if (folio_pos(folio) < mpd->start_pos) continue; - if (folio_next_index(folio) - 1 > end) + if (folio_next_index(folio) > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -1920,7 +1921,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { - mpd->first_page += folio_nr_pages(folio); + mpd->start_pos += folio_size(folio); folio_unlock(folio); } @@ -1930,7 +1931,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) loff_t size; int err; - BUG_ON(folio->index != mpd->first_page); + WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path @@ -2338,7 +2339,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ - disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; + disksize = mpd->start_pos; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; @@ -2441,8 +2442,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; - pgoff_t index = mpd->first_page; - pgoff_t end = mpd->last_page; + pgoff_t index = mpd->start_pos >> PAGE_SHIFT; + pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; @@ -2457,7 +2458,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; - mpd->next_page = index; + mpd->next_pos = mpd->start_pos; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); @@ -2488,7 +2489,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; /* If we can't merge this page, we are done. */ - if (mpd->map.m_len > 0 && mpd->next_page != folio->index) + if (mpd->map.m_len > 0 && + mpd->next_pos != folio_pos(folio)) goto out; if (handle) { @@ -2534,8 +2536,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) } if (mpd->map.m_len == 0) - mpd->first_page = folio->index; - mpd->next_page = folio_next_index(folio); + mpd->start_pos = folio_pos(folio); + mpd->next_pos = folio_pos(folio) + folio_size(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we @@ -2679,18 +2681,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; - mpd->first_page = writeback_index; - mpd->last_page = -1; + mpd->start_pos = writeback_index << PAGE_SHIFT; + mpd->end_pos = LLONG_MAX; } else { - mpd->first_page = wbc->range_start >> PAGE_SHIFT; - mpd->last_page = wbc->range_end >> PAGE_SHIFT; + mpd->start_pos = wbc->range_start; + mpd->end_pos = wbc->range_end; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, mpd->first_page, - mpd->last_page); + tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, + mpd->end_pos >> PAGE_SHIFT); blk_start_plug(&plug); /* @@ -2750,7 +2752,7 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) } mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd->first_page, wbc); + trace_ext4_da_write_pages(inode, mpd->start_pos, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, @@ -2807,8 +2809,8 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - mpd->last_page = writeback_index - 1; - mpd->first_page = 0; + mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; + mpd->start_pos = 0; goto retry; } @@ -2818,7 +2820,7 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = mpd->first_page; + mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 2fe78c23861d..5e3c2e358dd7 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -483,15 +483,15 @@ TRACE_EVENT(ext4_writepages, ); TRACE_EVENT(ext4_da_write_pages, - TP_PROTO(struct inode *inode, pgoff_t first_page, + TP_PROTO(struct inode *inode, loff_t start_pos, struct writeback_control *wbc), - TP_ARGS(inode, first_page, wbc), + TP_ARGS(inode, start_pos, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( pgoff_t, first_page ) + __field( loff_t, start_pos ) __field( long, nr_to_write ) __field( int, sync_mode ) ), @@ -499,15 +499,14 @@ TRACE_EVENT(ext4_da_write_pages, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->first_page = first_page; + __entry->start_pos = start_pos; __entry->nr_to_write = wbc->nr_to_write; __entry->sync_mode = wbc->sync_mode; ), - TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld " - "sync_mode %d", + TP_printk("dev %d,%d ino %lu start_pos 0x%llx nr_to_write %ld sync_mode %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->first_page, + (unsigned long) __entry->ino, __entry->start_pos, __entry->nr_to_write, __entry->sync_mode) ); -- Gitee From 68c449999884fb731fb7a04914414184fdb661bc Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:05 +0800 Subject: [PATCH 04/14] ext4: move the calculation of wbc->nr_to_write to mpage_folio_done() ANBZ: #21672 commit f922c8c2461b022a2efd9914484901fb358a5b2a ext4. mpage_folio_done() should be a more appropriate place than mpage_submit_folio() for updating the wbc->nr_to_write after we have submitted a fully mapped folio. Preparing to make mpage_submit_folio() allows to submit partially mapped folio that is still under processing. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250707140814.542883-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2beee81428bc..914b1eba3dad 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1922,6 +1922,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { mpd->start_pos += folio_size(folio); + mpd->wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -1952,8 +1953,6 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); - if (!err) - mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } -- Gitee From 49f700bffeb3f113349b097bd34baf8877554b13 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:06 +0800 Subject: [PATCH 05/14] ext4: fix stale data if it bail out of the extents mapping loop ANBZ: #21672 commit ded2d726a3041fce8afd88005cbfe15cd4737702 ext4. During the process of writing back folios, if mpage_map_and_submit_extent() exits the extent mapping loop due to an ENOSPC or ENOMEM error, it may result in stale data or filesystem inconsistency in environments where the block size is smaller than the folio size. When mapping a discontinuous folio in mpage_map_and_submit_extent(), some buffers may have already be mapped. If we exit the mapping loop prematurely, the folio data within the mapped range will not be written back, and the file's disk size will not be updated. Once the transaction that includes this range of extents is committed, this can lead to stale data or filesystem inconsistency. Fix this by submitting the current processing partially mapped folio. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250707140814.542883-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 914b1eba3dad..f52aa05584da 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2253,6 +2253,47 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) return 0; } +/* + * This is used to submit mapped buffers in a single folio that is not fully + * mapped for various reasons, such as insufficient space or journal credits. + */ +static int mpage_submit_partial_folio(struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct folio *folio; + loff_t pos; + int ret; + + folio = filemap_get_folio(inode->i_mapping, + mpd->start_pos >> PAGE_SHIFT); + if (IS_ERR(folio)) + return PTR_ERR(folio); + /* + * The mapped position should be within the current processing folio + * but must not be the folio start position. + */ + pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; + if (WARN_ON_ONCE((folio_pos(folio) == pos) || + !folio_contains(folio, pos >> PAGE_SHIFT))) + return -EINVAL; + + ret = mpage_submit_folio(mpd, folio); + if (ret) + goto out; + /* + * Update start_pos to prevent this folio from being released in + * mpage_release_unused_pages(), it will be reset to the aligned folio + * pos when this folio is written again in the next round. Additionally, + * do not update wbc->nr_to_write here, as it will be updated once the + * entire folio has finished processing. + */ + mpd->start_pos = pos; +out: + folio_unlock(folio); + folio_put(folio); + return ret; +} + /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO @@ -2303,8 +2344,16 @@ static int mpage_map_and_submit_extent(handle_t *handle, */ if ((err == -ENOMEM) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { - if (progress) + /* + * We may have already allocated extents for + * some bhs inside the folio, issue the + * corresponding data to prevent stale data. + */ + if (progress) { + if (mpage_submit_partial_folio(mpd)) + goto invalidate_dirty_pages; goto update_disksize; + } return err; } ext4_msg(sb, KERN_CRIT, -- Gitee From 57ba77f886e961aaabb051567af611bdcc6f643c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 1 Jul 2025 21:06:29 +0800 Subject: [PATCH 06/14] ext4: refactor the block allocation process of ext4_page_mkwrite() ANBZ: #21672 commit 2bddafea3d0d85ee9ac3cf5ba9a4b2f2d2f50257 ext4. The block allocation process and error handling in ext4_page_mkwrite() is complex now. Refactor it by introducing a new helper function, ext4_block_page_mkwrite(). It will call ext4_block_write_begin() to allocate blocks instead of directly calling block_page_mkwrite(). Preparing to implement retry logic in a subsequent patch to address situations where the reserved journal credits are insufficient. Additionally, this modification will help prevent potential deadlocks that may occur when waiting for folio writeback while holding the transaction handle. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250707140814.542883-5-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 95 ++++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 45 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f52aa05584da..bea55c54be0c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6245,6 +6245,53 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, return !buffer_mapped(bh); } +static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, + get_block_t get_block) +{ + handle_t *handle; + loff_t size; + unsigned long len; + int ret; + + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + folio_lock(folio); + size = i_size_read(inode); + /* Page got truncated from under us? */ + if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { + ret = -EFAULT; + goto out_error; + } + + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); + + ret = ext4_block_write_begin(handle, folio, 0, len, get_block); + if (ret) + goto out_error; + + if (!ext4_should_journal_data(inode)) { + block_commit_write(folio, 0, len); + folio_mark_dirty(folio); + } else { + ret = ext4_journal_folio_buffers(handle, folio, len); + if (ret) + goto out_error; + } + ext4_journal_stop(handle); + folio_wait_stable(folio); + return ret; + +out_error: + folio_unlock(folio); + ext4_journal_stop(handle); + return ret; +} + vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -6256,8 +6303,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - handle_t *handle; - get_block_t *get_block; + get_block_t *get_block = ext4_get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) @@ -6325,46 +6371,9 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; - else - get_block = ext4_get_block; retry_alloc: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = VM_FAULT_SIGBUS; - goto out; - } - /* - * Data journalling can't use block_page_mkwrite() because it - * will set_buffer_dirty() before do_journal_get_write_access() - * thus might hit warning messages for dirty metadata buffers. - */ - if (!ext4_should_journal_data(inode)) { - err = block_page_mkwrite(vma, vmf, get_block); - } else { - folio_lock(folio); - size = i_size_read(inode); - /* Page got truncated from under us? */ - if (folio->mapping != mapping || folio_pos(folio) > size) { - ret = VM_FAULT_NOPAGE; - goto out_error; - } - - len = folio_size(folio); - if (folio_pos(folio) + len > size) - len = size - folio_pos(folio); - - err = ext4_block_write_begin(handle, folio, 0, len, - ext4_get_block); - if (!err) { - ret = VM_FAULT_SIGBUS; - if (ext4_journal_folio_buffers(handle, folio, len)) - goto out_error; - } else { - folio_unlock(folio); - } - } - ext4_journal_stop(handle); + /* Start journal and allocate blocks */ + err = ext4_block_page_mkwrite(inode, folio, get_block); if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: @@ -6373,8 +6382,4 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; -out_error: - folio_unlock(folio); - ext4_journal_stop(handle); - goto out; } -- Gitee From 5694efc21267ab7ad9f21c3655410f4238b0bed1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:08 +0800 Subject: [PATCH 07/14] ext4: restart handle if credits are insufficient during allocating blocks ANBZ: #21672 commit e2c4c49dee64ca2f42ad2958cbe1805de96b6732 ext4. After large folios are supported on ext4, writing back a sufficiently large and discontinuous folio may consume a significant number of journal credits, placing considerable strain on the journal. For example, in a 20GB filesystem with 1K block size and 1MB journal size, writing back a 2MB folio could require thousands of credits in the worst-case scenario (when each block is discontinuous and distributed across different block groups), potentially exceeding the journal size. This issue can also occur in ext4_write_begin() and ext4_page_mkwrite() when delalloc is not enabled. Fix this by ensuring that there are sufficient journal credits before allocating an extent in mpage_map_one_extent() and ext4_block_write_begin(). If there are not enough credits, return -EAGAIN, exit the current mapping loop, restart a new handle and a new transaction, and allocating blocks on this folio again in the next iteration. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250707140814.542883-6-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bea55c54be0c..a905eb6063d2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -767,6 +767,26 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } +/* + * Make sure that the current journal transaction has enough credits to map + * one extent. Return -EAGAIN if it cannot extend the current running + * transaction. + */ +static inline int ext4_journal_ensure_extent_credits(handle_t *handle, + struct inode *inode) +{ + int credits; + int ret; + + /* Called from ext4_da_write_begin() which has no handle started? */ + if (!handle) + return 0; + + credits = ext4_chunk_trans_blocks(inode, 1); + ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); + return ret <= 0 ? ret : -EAGAIN; +} + static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { @@ -1074,7 +1094,9 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); + err = ext4_journal_ensure_extent_credits(handle, inode); + if (!err) + err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { @@ -1272,8 +1294,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ext4_orphan_del(NULL, inode); } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + if (ret == -EAGAIN || + (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_journal; folio_put(folio); return ret; @@ -2219,6 +2242,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) int get_blocks_flags; int err, dioread_nolock; + /* Make sure transaction has enough credits for this extent */ + err = ext4_journal_ensure_extent_credits(handle, inode); + if (err < 0) + return err; + trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or @@ -2342,7 +2370,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ - if ((err == -ENOMEM) || + if ((err == -ENOMEM) || (err == -EAGAIN) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { /* * We may have already allocated extents for @@ -2849,6 +2877,8 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) ret = 0; continue; } + if (ret == -EAGAIN) + ret = 0; /* Fatal error - ENOMEM, EIO... */ if (ret) break; @@ -6374,7 +6404,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) retry_alloc: /* Start journal and allocate blocks */ err = ext4_block_page_mkwrite(inode, folio, get_block); - if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + if (err == -EAGAIN || + (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_alloc; out_ret: ret = vmf_fs_error(err); -- Gitee From bf91abf20bf7e6217d9ddb6d62905ece94b7c5c6 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:09 +0800 Subject: [PATCH 08/14] ext4: enhance tracepoints during the folios writeback ANBZ: #21672 commit 6b132759b0fe78e518abafb62190c294100db6d6 ext4. After mpage_map_and_submit_extent() supports restarting handle if credits are insufficient during allocating blocks, it is more likely to exit the current mapping iteration and continue to process the current processing partially mapped folio again. The existing tracepoints are not sufficient to track this situation, so enhance the tracepoints to track the writeback position and the return value before and after submitting the folios. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250707140814.542883-7-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 5 ++++- include/trace/events/ext4.h | 42 ++++++++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a905eb6063d2..0bff569b4c9e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2828,7 +2828,8 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) } mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd->start_pos, wbc); + trace_ext4_da_write_folios_start(inode, mpd->start_pos, + mpd->next_pos, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, @@ -2866,6 +2867,8 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; + trace_ext4_da_write_folios_end(inode, mpd->start_pos, + mpd->next_pos, wbc, ret); if (ret == -ENOSPC && sbi->s_journal) { /* diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 5e3c2e358dd7..b1c7278248d9 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -482,16 +482,17 @@ TRACE_EVENT(ext4_writepages, (unsigned long) __entry->writeback_index) ); -TRACE_EVENT(ext4_da_write_pages, - TP_PROTO(struct inode *inode, loff_t start_pos, +TRACE_EVENT(ext4_da_write_folios_start, + TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos, struct writeback_control *wbc), - TP_ARGS(inode, start_pos, wbc), + TP_ARGS(inode, start_pos, next_pos, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, start_pos ) + __field( loff_t, next_pos ) __field( long, nr_to_write ) __field( int, sync_mode ) ), @@ -500,16 +501,47 @@ TRACE_EVENT(ext4_da_write_pages, __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start_pos = start_pos; + __entry->next_pos = next_pos; __entry->nr_to_write = wbc->nr_to_write; __entry->sync_mode = wbc->sync_mode; ), - TP_printk("dev %d,%d ino %lu start_pos 0x%llx nr_to_write %ld sync_mode %d", + TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld sync_mode %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->start_pos, + (unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos, __entry->nr_to_write, __entry->sync_mode) ); +TRACE_EVENT(ext4_da_write_folios_end, + TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos, + struct writeback_control *wbc, int ret), + + TP_ARGS(inode, start_pos, next_pos, wbc, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, start_pos ) + __field( loff_t, next_pos ) + __field( long, nr_to_write ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start_pos = start_pos; + __entry->next_pos = next_pos; + __entry->nr_to_write = wbc->nr_to_write; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos, + __entry->nr_to_write, __entry->ret) +); + TRACE_EVENT(ext4_da_write_pages_extent, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map), -- Gitee From 45d8986097ea46d673134dfb64eb08629ca41928 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:10 +0800 Subject: [PATCH 09/14] ext4: correct the reserved credits for extent conversion ANBZ: #21672 commit 95ad8ee45cdbc321c135a2db895d48b374ef0f87 ext4. Now, we reserve journal credits for converting extents in only one page to written state when the I/O operation is complete. This is insufficient when large folio is enabled. Fix this by reserving credits for converting up to one extent per block in the largest 2MB folio, this calculation should only involve extents index and leaf blocks, so it should not estimate too many credits. Fixes: 7ac67301e82f ("ext4: enable large folio for regular file") Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250707140814.542883-8-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0bff569b4c9e..7ae42a4ed652 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2742,12 +2742,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { + int bpf = ext4_journal_blocks_per_folio(inode); /* * We may need to convert up to one extent per block in - * the page and we may dirty the inode. + * the folio and we may dirty the inode. */ - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, - PAGE_SIZE >> inode->i_blkbits); + rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) -- Gitee From eb64787f062aeadf714ec4885d006bdf37dba727 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:11 +0800 Subject: [PATCH 10/14] ext4: reserved credits for one extent during the folio writeback ANBZ: #21672 commit bbbf150f3f85619569ac19dc6458cca7c492e715 ext4. After ext4 supports large folios, reserving journal credits for one maximum-ordered folio based on the worst case cenario during the writeback process can easily exceed the maximum transaction credits. Additionally, reserving journal credits for one page is also no longer appropriate. Currently, the folio writeback process can either extend the journal credits or initiate a new transaction if the currently reserved journal credits are insufficient. Therefore, it can be modified to reserve credits for only one extent at the outset. In most cases involving continuous mapping, these credits are generally adequate, and we may only need to perform some basic credit expansion. However, in extreme cases where the block size and folio size differ significantly, or when the folios are sufficiently discontinuous, it may be necessary to restart a new transaction and resubmit the folios. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250707140814.542883-9-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7ae42a4ed652..923dc6e9049c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2439,21 +2439,6 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } -/* - * Calculate the total number of credits to reserve for one writepages - * iteration. This is called from ext4_writepages(). We map an extent of - * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping - * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + - * bpp - 1 blocks in bpp different extents. - */ -static int ext4_da_writepages_trans_blocks(struct inode *inode) -{ - int bpp = ext4_journal_blocks_per_folio(inode); - - return ext4_meta_trans_blocks(inode, - MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); -} - static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { @@ -2811,8 +2796,14 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); - needed_blocks = ext4_da_writepages_trans_blocks(inode); - + /* + * Calculate the number of credits needed to reserve for one + * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will + * attempt to extend the transaction or start a new iteration + * if the reserved credits are insufficient. + */ + needed_blocks = ext4_chunk_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); -- Gitee From c18969b483ff9e86211b1265cde06b7b7624260d Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 16 May 2025 01:20:51 +0530 Subject: [PATCH 11/14] ext4: Make ext4_meta_trans_blocks() non-static for later use ANBZ: #21672 commit 255e7bc2127cbd3a718a55d2da5b2d3f015adcd7 upstream. Let's make ext4_meta_trans_blocks() non-static for use in later functions during ->end_io conversion for atomic writes. We will need this function to estimate journal credits for a special case. Instead of adding another wrapper around it, let's make this non-static. Reviewed-by: Ojaswin Mujoo Acked-by: Darrick J. Wong Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/23ce80d4286f792831ce99d13558182ee228fedb.1747337952.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/ext4.h | 2 ++ fs/ext4/inode.c | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f30b1440ac2e..62b2f5655cd0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3008,6 +3008,8 @@ extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 923dc6e9049c..0724a64e20b0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -142,9 +142,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. @@ -5847,8 +5844,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents) +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; -- Gitee From e1ebd8c87111435f022a455add3adc314c0efaa7 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:12 +0800 Subject: [PATCH 12/14] ext4: replace ext4_writepage_trans_blocks() ANBZ: #21672 commit 57661f28756c59510e31543520b5b8f5e591f384 ext4. After ext4 supports large folios, the semantics of reserving credits in pages is no longer applicable. In most scenarios, reserving credits in extents is sufficient. Therefore, introduce ext4_chunk_trans_extent() to replace ext4_writepage_trans_blocks(). move_extent_per_page() is the only remaining location where we are still processing extents in pages. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250707140814.542883-10-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 6 +++--- fs/ext4/inline.c | 6 +++--- fs/ext4/inode.c | 33 +++++++++++++++------------------ fs/ext4/move_extent.c | 3 ++- fs/ext4/xattr.c | 2 +- 6 files changed, 25 insertions(+), 27 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 62b2f5655cd0..8ed3dfac1aa8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3005,9 +3005,9 @@ extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); -extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cd1a4bcea76e..6f12a6924477 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5020,7 +5020,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, credits = depth + 2; } - restart_credits = ext4_writepage_trans_blocks(inode); + restart_credits = ext4_chunk_trans_extent(inode, 0); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); if (err) { @@ -5280,7 +5280,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) truncate_pagecache(inode, start); - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -5374,7 +5374,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) truncate_pagecache(inode, start); - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9168e117ba2c..09bf9872dfcf 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -557,7 +557,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, return 0; } - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1927,7 +1927,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) }; - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2042,7 +2042,7 @@ int ext4_convert_inline_data(struct inode *inode) return 0; } - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0724a64e20b0..84a1a1566346 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1191,7 +1191,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ - needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + needed_blocks = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)) + 1; index = pos >> PAGE_SHIFT; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -4131,7 +4132,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 2); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); @@ -4278,7 +4279,7 @@ int ext4_truncate(struct inode *inode) } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 1); else credits = ext4_blocks_for_truncate(inode); @@ -5878,25 +5879,19 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) } /* - * Calculate the total number of credits to reserve to fit - * the modification of a single pages into a single transaction, - * which may include multiple chunks of block allocations. - * - * This could be called via ext4_write_begin() - * - * We need to consider the worse case, when - * one new block per extent. + * Calculate the journal credits for modifying the number of blocks + * in a single extent within one transaction. 'nrblocks' is used only + * for non-extent inodes. For extent type inodes, 'nrblocks' can be + * zero if the exact number of blocks is unknown. */ -int ext4_writepage_trans_blocks(struct inode *inode) +int ext4_chunk_trans_extent(struct inode *inode, int nrblocks) { - int bpp = ext4_journal_blocks_per_folio(inode); int ret; - ret = ext4_meta_trans_blocks(inode, bpp, bpp); - + ret = ext4_meta_trans_blocks(inode, nrblocks, 1); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret += bpp; + ret += nrblocks; return ret; } @@ -6271,10 +6266,12 @@ static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, handle_t *handle; loff_t size; unsigned long len; + int credits; int ret; - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); + credits = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 782b64a0fa4b..23e0f9f6b107 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -278,7 +278,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, */ again: *err = 0; - jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page, + block_len_in_page) * 2; handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ca22aa9e04b4..a56f7a8779ed 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -979,7 +979,7 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, * so we need to reserve credits for this eventuality */ if (inode && ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; + credits += ext4_chunk_trans_extent(inode, 1) + 1; /* We are done if ea_inode feature is not enabled. */ if (!ext4_has_feature_ea_inode(sb)) -- Gitee From e4d8e4888eeb710d2b45f3c3b84e183804884b33 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:13 +0800 Subject: [PATCH 13/14] ext4: fix insufficient credits calculation in ext4_meta_trans_blocks() ANBZ: #21672 commit 5137d6c8906b55b3c7b5d1aa5a549753ec8520f5 ext4. The calculation of journal credits in ext4_meta_trans_blocks() should include pextents, as each extent separately may be allocated from a different group and thus need to update different bitmap and group descriptor block. Fixes: 0e32d8617012 ("ext4: correct the journal credits calculations of allocating blocks") Reported-by: Jan Kara Closes: https://lore.kernel.org/linux-ext4/nhxfuu53wyacsrq7xqgxvgzcggyscu2tbabginahcygvmc45hy@t4fvmyeky33e/ Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250707140814.542883-11-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 84a1a1566346..3d7c0685c8cc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5853,7 +5853,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) int ret; /* - * How many index and lead blocks need to touch to map @lblocks + * How many index and leaf blocks need to touch to map @lblocks * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); @@ -5862,7 +5862,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks; + groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; -- Gitee From f2eab1a87503129b72a04b8a95a1ab8fdee4d8e1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 7 Jul 2025 22:08:14 +0800 Subject: [PATCH 14/14] ext4: limit the maximum folio order ANBZ: #21672 commit b12f423d598fd874df9ecfb2436789d582fda8e6 ext4. In environments with a page size of 64KB, the maximum size of a folio can reach up to 128MB. Consequently, during the write-back of folios, the 'rsv_blocks' will be overestimated to 1,577, which can make pressure on the journal space where the journal is small. This can easily exceed the limit of a single transaction. Besides, an excessively large folio is meaningless and will instead increase the overhead of traversing the bhs within the folio. Therefore, limit the maximum order of a folio to 2048 filesystem blocks. Reported-by: Naresh Kamboju Reported-by: Joseph Qi Closes: https://lore.kernel.org/linux-ext4/CA+G9fYsyYQ3ZL4xaSg1-Tt5Evto7Zd+hgNWZEa9cQLbahA1+xg@mail.gmail.com/ Signed-off-by: Zhang Yi Tested-by: Joseph Qi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250707140814.542883-12-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Joseph Qi --- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 3 +-- fs/ext4/inode.c | 22 +++++++++++++++++++--- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8ed3dfac1aa8..f2555e6262d7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2961,7 +2961,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); -bool ext4_should_enable_large_folio(struct inode *inode); +void ext4_set_inode_mapping_order(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9f540530bb85..97223746a15f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1346,8 +1346,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, ei->i_datasync_tid = handle->h_transaction->t_tid; } - if (ext4_should_enable_large_folio(inode)) - mapping_set_large_folios(inode->i_mapping); + ext4_set_inode_mapping_order(inode); ext4_update_inode_fsync_trans(handle, inode, 1); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3d7c0685c8cc..e8b54a6531ff 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4840,7 +4840,7 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, return -EFSCORRUPTED; } -bool ext4_should_enable_large_folio(struct inode *inode) +static bool ext4_should_enable_large_folio(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -4857,6 +4857,22 @@ bool ext4_should_enable_large_folio(struct inode *inode) return true; } +/* + * Limit the maximum folio order to 2048 blocks to prevent overestimation + * of reserve handle credits during the folio writeback in environments + * where the PAGE_SIZE exceeds 4KB. + */ +#define EXT4_MAX_PAGECACHE_ORDER(i) \ + umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) +void ext4_set_inode_mapping_order(struct inode *inode) +{ + if (!ext4_should_enable_large_folio(inode)) + return; + + mapping_set_folio_order_range(inode->i_mapping, 0, + EXT4_MAX_PAGECACHE_ORDER(inode)); +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -5169,8 +5185,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if (ext4_should_enable_large_folio(inode)) - mapping_set_large_folios(inode->i_mapping); + + ext4_set_inode_mapping_order(inode); ret = check_igot_inode(inode, flags, function, line); /* -- Gitee