From 1936fea1e2992e625722406765a2520f97f318d7 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Tue, 29 Nov 2022 09:09:17 +1100
Subject: [PATCH 1/2] iomap: write iomap validity checks

mainline inclusion
from mainline-v6.1-rc8
commit d7b64041164ca177170191d2ad775da074ab2926
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IBWO90
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d7b64041164ca177170191d2ad775da074ab2926

--------------------------------

A recent multithreaded write data corruption has been uncovered in the
iomap write code. The core of the problem is that partial folio writes
can be flushed to disk while a new racing write can map it and fill the
rest of the page:

writeback                       new write

allocate blocks
  blocks are unwritten
submit IO
.....
                                map blocks
                                iomap indicates UNWRITTEN range
                                loop {
                                  lock folio
                                  copyin data
.....
IO completes
  runs unwritten extent conv
    blocks are marked written
                                  <iomap now stale>
                                  get next folio
                                }

Now add memory pressure such that memory reclaim evicts the partially
written folio that has already been written to disk. When the new write
finally gets to the last partial page of the new write, it does not find
it in cache, so it instantiates a new page, sees the iomap is unwritten,
and zeros the part of the page that it does not have data from. This
overwrites the data on disk that was originally written.

The full description of the corruption mechanism can be found here:

https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/

To solve this problem, we need to check whether the iomap is still valid
after we lock each folio during the write. We have to do it after we
lock the page so that we don't end up with state changes occurring while
we wait for the folio to be locked.

Hence we need a mechanism to be able to check that the cached iomap is
still valid (similar to what we already do in buffered writeback), and
we need a way for ->begin_write to back out and tell the high level
iomap iterator that we need to remap the remaining write range.

The iomap needs to grow some storage for the validity cookie that the
filesystem provides to travel with the iomap. XFS, in particular, also
needs to know some more information about what the iomap maps to
(attribute extents rather than file data extents) for the validity
cookie to cover all the types of iomaps we might need to validate.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Reviewed-by: Darrick J. Wong
Signed-off-by: Gou Hao
Signed-off-by: jiazhenyuan
---
 fs/iomap.c            | 29 +++++++++++++++++++++++++++++
 include/linux/iomap.h | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index d7234c7ac95e..7809b47395fc 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -745,6 +745,28 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	if (!page)
 		return -ENOMEM;
 
+	/*
+	 * Now we have a locked folio, before we do anything with it we need to
+	 * check that the iomap we have cached is not stale. The inode extent
+	 * mapping can change due to concurrent IO in flight (e.g.
+	 * IOMAP_UNWRITTEN state can change and memory reclaim could have
+	 * reclaimed a previously partially written page at this index after IO
+	 * completion before this write reaches this file offset) and hence we
+	 * could do the wrong thing here (zero a page range incorrectly or fail
+	 * to zero) and corrupt data.
+	 */
+	if (iomap->iomap_valid) {
+		bool iomap_valid = iomap->iomap_valid(inode, iomap);
+
+		if (!iomap_valid) {
+			iomap->flags |= IOMAP_F_STALE;
+			status = 0;
+			unlock_page(page);
+			put_page(page);
+			page = NULL;
+			goto out;
+		}
+	}
 	if (iomap->type == IOMAP_INLINE)
 		iomap_read_inline_data(inode, page, iomap);
 	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
@@ -759,6 +781,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		iomap_write_failed(inode, pos, len);
 	}
 
+out:
 	*pagep = page;
 	return status;
 }
@@ -895,6 +918,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 				iomap);
 		if (unlikely(status))
 			break;
+		if (iomap->flags & IOMAP_F_STALE)
+			break;
 
 		if (mapping_writably_mapped(inode->i_mapping))
 			flush_dcache_page(page);
@@ -995,6 +1020,8 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		put_page(rpage);
 		if (unlikely(status))
 			return status;
+		if (iomap->flags & IOMAP_F_STALE)
+			break;
 
 		WARN_ON_ONCE(!PageUptodate(page));
 
@@ -1046,6 +1073,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 			iomap);
 	if (status)
 		return status;
+	if (iomap->flags & IOMAP_F_STALE)
+		return status;
 
 	zero_user(page, offset, bytes);
 	mark_page_accessed(page);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4ce1290ac0c3..4bd65630bb75 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -26,7 +26,6 @@ struct vm_fault;
 #define IOMAP_MAPPED	0x03	/* blocks allocated at @addr */
 #define IOMAP_UNWRITTEN	0x04	/* blocks allocated at @addr in unwritten state */
 #define IOMAP_INLINE	0x05	/* data inline in the inode */
-
 /*
  * Flags for all iomap mappings:
  *
@@ -42,6 +41,19 @@ struct vm_fault;
  */
 #define IOMAP_F_MERGED	0x10	/* contains multiple blocks/extents */
 #define IOMAP_F_SHARED	0x20	/* block shared with another file */
+/*
+ *
+ * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
+ * rather than a file data extent.
+ */
+#define IOMAP_F_XATTR	0x40
+
+/*
+ * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file
+ * range it covers needs to be remapped by the high level before the operation
+ * can proceed.
+ */
+#define IOMAP_F_STALE	0x200
 
 /*
  * Flags from 0x1000 up are for file system specific usage:
@@ -67,6 +79,7 @@ struct iomap {
 	struct dax_device	*dax_dev; /* dax_dev for dax operations */
 	void			*inline_data;
 	void			*private; /* filesystem private */
+	u64			validity_cookie; /* used with .iomap_valid() */
 
 	/*
 	 * Called when finished processing a page in the mapping returned in
@@ -75,6 +88,23 @@ struct iomap {
 	 */
 	void (*page_done)(struct inode *inode, loff_t pos, unsigned copied,
 			struct page *page, struct iomap *iomap);
+
+	/*
+	 * Check that the cached iomap still maps correctly to the filesystem's
+	 * internal extent map. FS internal extent maps can change while iomap
+	 * is iterating a cached iomap, so this hook allows iomap to detect that
+	 * the iomap needs to be refreshed during a long running write
+	 * operation.
+	 *
+	 * The filesystem can store internal state (e.g. a sequence number) in
+	 * iomap->validity_cookie when the iomap is first mapped to be able to
+	 * detect changes between mapping time and whenever .iomap_valid() is
+	 * called.
+	 *
+	 * This is called with the folio over the specified file position held
+	 * locked by the iomap code.
+	 */
+	bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
 };
 
 /*
-- 
Gitee
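[Editor's note, not part of the patches: the ->iomap_valid() mechanism
introduced above is a sequence-cookie revalidation pattern: sample a
counter when the mapping is created, then recheck it only once the folio
lock is held. The standalone C sketch below condenses that pattern; every
name in it (extent_map, cached_iomap, map_range, iomap_still_valid) is
invented for illustration, and none of it is kernel code.]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct extent_map {
	uint64_t seq;			/* bumped on every extent map change */
};

struct cached_iomap {
	const struct extent_map *src;	/* where the mapping came from */
	uint64_t validity_cookie;	/* seq sampled at mapping time */
};

/* Mapping time: cache the mapping and remember the current sequence. */
static void map_range(const struct extent_map *em, struct cached_iomap *io)
{
	io->src = em;
	io->validity_cookie = em->seq;
}

/*
 * Revalidation time, the analogue of ->iomap_valid(): the cached mapping
 * may be used only if nothing changed the extent map since map_range().
 * In the patch this check runs after the folio lock is taken, so no
 * further change can slip in between the check and the data copy.
 */
static bool iomap_still_valid(const struct cached_iomap *io)
{
	return io->validity_cookie == io->src->seq;
}

int main(void)
{
	struct extent_map em = { .seq = 1 };
	struct cached_iomap io;

	map_range(&em, &io);
	printf("before racing change: valid=%d\n", iomap_still_valid(&io));

	em.seq++;	/* a racing unwritten extent conversion */

	/* The writer now sees a stale mapping and must remap the range. */
	printf("after racing change:  valid=%d\n", iomap_still_valid(&io));
	return 0;
}

[End of editor's note.]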
From 61bd143ee424e000b1170b66c0018f3136a0e030 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Tue, 29 Nov 2022 09:09:17 +1100
Subject: [PATCH 2/2] xfs: use iomap_valid method to detect stale cached
 iomaps

mainline inclusion
from mainline-v6.1-rc8
commit 304a68b9c63bbfc1f6e159d68e8892fc54a06067
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IBWO90
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=304a68b9c63bbfc1f6e159d68e8892fc54a06067

--------------------------------

Now that iomap supports a mechanism to validate cached iomaps for
buffered write operations, hook it up to the XFS buffered write ops
so that we can avoid data corruptions that result from stale cached
iomaps. See:

https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/

or the ->iomap_valid() introduction commit for exact details of the
corruption vector.

The validity cookie we store in the iomap is based on the type of
iomap we return. It is expected that the iomap->flags we set in
xfs_bmbt_to_iomap() are not perturbed by the iomap core and are
returned to us in the iomap passed via the .iomap_valid() callback.
This ensures that the validity cookie is always checking the correct
inode fork sequence numbers to detect potential changes that affect
the extent cached by the iomap.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
Signed-off-by: Gou Hao
Signed-off-by: jiazhenyuan
---
 fs/xfs/xfs_iomap.c | 54 ++++++++++++++++++++++++++++++++++++++++------
 fs/xfs/xfs_iomap.h |  5 +++--
 fs/xfs/xfs_pnfs.c  |  6 ++++--
 3 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 12df214edbe1..6ebcc03c851e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -35,11 +35,39 @@
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
 						<< mp->m_writeio_log)
 
+u64
+xfs_iomap_inode_sequence(
+	struct xfs_inode	*ip,
+	u16			iomap_flags)
+{
+	u64			cookie = 0;
+
+	if (iomap_flags & IOMAP_F_XATTR)
+		return READ_ONCE(ip->i_afp->if_seq);
+	if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp)
+		cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32;
+	return cookie | READ_ONCE(ip->i_df.if_seq);
+}
+
+/*
+ * Check that the iomap passed to us is still valid for the given offset and
+ * length.
+ */
+static bool
+xfs_iomap_valid(
+	struct inode		*inode,
+	const struct iomap	*iomap)
+{
+	return iomap->validity_cookie ==
+			xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags);
+}
+
 void
 xfs_bmbt_to_iomap(
 	struct xfs_inode	*ip,
 	struct iomap		*iomap,
-	struct xfs_bmbt_irec	*imap)
+	struct xfs_bmbt_irec	*imap,
+	u64			sequence_cookie)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 
@@ -60,6 +88,9 @@ xfs_bmbt_to_iomap(
 	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
 	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
 	iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+	iomap->validity_cookie = sequence_cookie;
+	iomap->iomap_valid = xfs_iomap_valid;
 }
 
 xfs_extlen_t
@@ -146,7 +177,8 @@ xfs_iomap_write_direct(
 	xfs_off_t	offset,
 	size_t		count,
 	xfs_bmbt_irec_t	*imap,
-	int		nmaps)
+	int		nmaps,
+	u64		*seq)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
@@ -277,6 +309,7 @@ xfs_iomap_write_direct(
 		error = xfs_alert_fsblock_zero(ip, imap);
 
 out_unlock:
+	*seq = xfs_iomap_inode_sequence(ip, 0);
 	xfs_iunlock(ip, lockmode);
 	return error;
 
@@ -515,6 +548,7 @@ xfs_file_iomap_begin_delay(
 	struct xfs_bmbt_irec	got;
 	struct xfs_iext_cursor	icur;
 	xfs_fsblock_t		prealloc_blocks = 0;
+	u64			seq;
 
 	ASSERT(!XFS_IS_REALTIME_INODE(ip));
 	ASSERT(!xfs_get_extsz_hint(ip));
@@ -552,6 +586,7 @@ xfs_file_iomap_begin_delay(
 		}
 
 		trace_xfs_iomap_found(ip, offset, count, 0, &got);
+		seq = xfs_iomap_inode_sequence(ip, 0);
 		goto done;
 	}
 
@@ -618,6 +653,7 @@ xfs_file_iomap_begin_delay(
 	 * them out if the write happens to fail.
 	 */
 	iomap->flags |= IOMAP_F_NEW;
+	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
 	trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
 done:
 	if (isnullstartblock(got.br_startblock))
@@ -629,7 +665,7 @@ xfs_file_iomap_begin_delay(
 		goto out_unlock;
 	}
 
-	xfs_bmbt_to_iomap(ip, iomap, &got);
+	xfs_bmbt_to_iomap(ip, iomap, &got, seq);
 
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -948,6 +984,7 @@ xfs_file_iomap_begin(
 	int			nimaps = 1, error = 0;
 	bool			shared = false, trimmed = false;
 	unsigned		lockmode;
+	u64			seq = 0;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
@@ -1048,7 +1085,7 @@ xfs_file_iomap_begin(
 	if (lockmode == XFS_ILOCK_EXCL)
 		xfs_ilock_demote(ip, lockmode);
 	error = xfs_iomap_write_direct(ip, offset, length, &imap,
-			nimaps);
+			nimaps, &seq);
 	if (error)
 		return error;
 
@@ -1060,7 +1097,9 @@ xfs_file_iomap_begin(
 			& ~XFS_ILOG_TIMESTAMP))
 		iomap->flags |= IOMAP_F_DIRTY;
 
-	xfs_bmbt_to_iomap(ip, iomap, &imap);
+	if (!seq)
+		seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0);
+	xfs_bmbt_to_iomap(ip, iomap, &imap, seq);
 
 	if (shared)
 		iomap->flags |= IOMAP_F_SHARED;
@@ -1068,6 +1107,7 @@ xfs_file_iomap_begin(
 
 out_found:
 	ASSERT(nimaps);
+	seq = xfs_iomap_inode_sequence(ip, 0);
 	xfs_iunlock(ip, lockmode);
 	trace_xfs_iomap_found(ip, offset, length, 0, &imap);
 	goto out_finish;
@@ -1169,6 +1209,7 @@ xfs_xattr_iomap_begin(
 	struct xfs_bmbt_irec	imap;
 	int			nimaps = 1, error = 0;
 	unsigned		lockmode;
+	int			seq;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
@@ -1184,12 +1225,13 @@ xfs_xattr_iomap_begin(
 	ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, XFS_BMAPI_ATTRFORK);
+	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR);
 out_unlock:
 	xfs_iunlock(ip, lockmode);
 
 	if (!error) {
 		ASSERT(nimaps);
-		xfs_bmbt_to_iomap(ip, iomap, &imap);
+		xfs_bmbt_to_iomap(ip, iomap, &imap, seq);
 	}
 
 	return error;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c6170548831b..14ff74627c26 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -12,13 +12,14 @@ struct xfs_inode;
 struct xfs_bmbt_irec;
 
 int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-			struct xfs_bmbt_irec *, int);
+			struct xfs_bmbt_irec *, int, u64 *);
 int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
 			struct xfs_bmbt_irec *, unsigned int *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+u64 xfs_iomap_inode_sequence(struct xfs_inode *, u16);
 void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
-		struct xfs_bmbt_irec *);
+		struct xfs_bmbt_irec *, u64);
 
 xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
 
 static inline xfs_filblks_t
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 1c9bced3e860..1d988086c90a 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -98,6 +98,7 @@ xfs_fs_map_blocks(
 	int			nimaps = 1;
 	uint			lock_flags;
 	int			error = 0;
+	u64			seq;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
@@ -149,6 +150,7 @@ xfs_fs_map_blocks(
 	lock_flags = xfs_ilock_data_map_shared(ip);
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 				&imap, &nimaps, bmapi_flags);
+	seq = xfs_iomap_inode_sequence(ip, 0);
 	xfs_iunlock(ip, lock_flags);
 
 	if (error)
@@ -166,7 +168,7 @@ xfs_fs_map_blocks(
 		 */
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		error = xfs_iomap_write_direct(ip, offset, length,
-				&imap, nimaps);
+				&imap, nimaps, &seq);
 		if (error)
 			goto out_unlock;
 
@@ -185,7 +187,7 @@ xfs_fs_map_blocks(
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
-	xfs_bmbt_to_iomap(ip, iomap, &imap);
+	xfs_bmbt_to_iomap(ip, iomap, &imap, seq);
 	*device_generation = mp->m_generation;
 	return error;
 out_unlock:
-- 
Gitee
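[Editor's note, not part of the patches: patch 2 derives the validity
cookie by packing two 32-bit fork sequence numbers into one u64 (COW fork
in the high half, data fork in the low half), with the attribute fork
sequence used alone for IOMAP_F_XATTR mappings. The standalone C sketch
below mirrors that packing; the types and names (fork, inode_forks,
inode_sequence) are invented for illustration, and unlike the kernel code
it assumes a COW fork always exists rather than checking for one.]

#include <stdint.h>
#include <stdio.h>

#define F_SHARED	0x20	/* mapping involves the COW fork */
#define F_XATTR		0x40	/* mapping is for attribute extents */

struct fork {
	uint32_t if_seq;	/* bumped on every change to this fork */
};

struct inode_forks {
	struct fork data;	/* file data extents */
	struct fork cow;	/* copy-on-write staging extents */
	struct fork attr;	/* extended attribute extents */
};

/* Mirrors the shape of xfs_iomap_inode_sequence(): one cookie per iomap. */
static uint64_t inode_sequence(const struct inode_forks *ip, uint16_t flags)
{
	uint64_t cookie = 0;

	if (flags & F_XATTR)
		return ip->attr.if_seq;
	if (flags & F_SHARED)
		cookie = (uint64_t)ip->cow.if_seq << 32;
	return cookie | ip->data.if_seq;
}

int main(void)
{
	struct inode_forks ip = { .data = {1}, .cow = {7}, .attr = {3} };
	uint64_t cookie = inode_sequence(&ip, F_SHARED);

	/* A COW fork change alone is enough to invalidate the cookie. */
	ip.cow.if_seq++;
	printf("stale=%d\n", cookie != inode_sequence(&ip, F_SHARED));
	return 0;
}

[End of editor's note.]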