diff --git a/fs/iomap.c b/fs/iomap.c index d7234c7ac95e25f8a981dce3b680df5e0d6f648b..7809b47395fc6687c75301bcba3cb431fc9bfc77 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -745,6 +745,28 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, if (!page) return -ENOMEM; + /* + * Now we have a locked folio, before we do anything with it we need to + * check that the iomap we have cached is not stale. The inode extent + * mapping can change due to concurrent IO in flight (e.g. + * IOMAP_UNWRITTEN state can change and memory reclaim could have + * reclaimed a previously partially written page at this index after IO + * completion before this write reaches this file offset) and hence we + * could do the wrong thing here (zero a page range incorrectly or fail + * to zero) and corrupt data. + */ + if (iomap->iomap_valid) { + bool iomap_valid = iomap->iomap_valid(inode, iomap); + if (!iomap_valid) { + iomap->flags |= IOMAP_F_STALE; + status = 0; + unlock_page(page); + put_page(page); + page = NULL; + goto out; + } + } + if (iomap->type == IOMAP_INLINE) iomap_read_inline_data(inode, page, iomap); else if (iomap->flags & IOMAP_F_BUFFER_HEAD) @@ -759,6 +781,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, iomap_write_failed(inode, pos, len); } +out: *pagep = page; return status; } @@ -895,6 +918,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, iomap); if (unlikely(status)) break; + if (iomap->flags & IOMAP_F_STALE) + break; if (mapping_writably_mapped(inode->i_mapping)) flush_dcache_page(page); @@ -995,6 +1020,8 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, put_page(rpage); if (unlikely(status)) return status; + if (iomap->flags & IOMAP_F_STALE) + break; WARN_ON_ONCE(!PageUptodate(page)); @@ -1046,6 +1073,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, iomap); if (status) return status; + if (iomap->flags & 
IOMAP_F_STALE) + return status; zero_user(page, offset, bytes); mark_page_accessed(page); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 12df214edbe1ea631c40073f7c547adc41b29c5d..6ebcc03c851ef527057ada6018fbff9b6c58aeb7 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -35,11 +35,39 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) +u64 +xfs_iomap_inode_sequence( + struct xfs_inode *ip, + u16 iomap_flags) +{ + u64 cookie = 0; + + if (iomap_flags & IOMAP_F_XATTR) + return READ_ONCE(ip->i_afp->if_seq); + if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) + cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; + return cookie | READ_ONCE(ip->i_df.if_seq); +} + +/* + * Check that the iomap passed to us is still valid for the given offset and + * length. + */ +static bool +xfs_iomap_valid( + struct inode *inode, + const struct iomap *iomap) +{ + return iomap->validity_cookie == + xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); +} + void xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + u64 sequence_cookie) { struct xfs_mount *mp = ip->i_mount; @@ -60,6 +88,9 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + + iomap->validity_cookie = sequence_cookie; + iomap->iomap_valid = xfs_iomap_valid; } xfs_extlen_t @@ -146,7 +177,8 @@ xfs_iomap_write_direct( xfs_off_t offset, size_t count, xfs_bmbt_irec_t *imap, - int nmaps) + int nmaps, + u64 *seq) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -277,6 +309,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: + *seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, lockmode); return error; @@ -515,6 +548,7 @@ xfs_file_iomap_begin_delay( struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; xfs_fsblock_t 
prealloc_blocks = 0; + u64 seq; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -552,6 +586,7 @@ xfs_file_iomap_begin_delay( } trace_xfs_iomap_found(ip, offset, count, 0, &got); + seq = xfs_iomap_inode_sequence(ip, 0); goto done; } @@ -618,6 +653,7 @@ xfs_file_iomap_begin_delay( * them out if the write happens to fail. */ iomap->flags |= IOMAP_F_NEW; + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -629,7 +665,7 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - xfs_bmbt_to_iomap(ip, iomap, &got); + xfs_bmbt_to_iomap(ip, iomap, &got, seq); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -948,6 +984,7 @@ xfs_file_iomap_begin( int nimaps = 1, error = 0; bool shared = false, trimmed = false; unsigned lockmode; + u64 seq = 0; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1048,7 +1085,7 @@ xfs_file_iomap_begin( if (lockmode == XFS_ILOCK_EXCL) xfs_ilock_demote(ip, lockmode); error = xfs_iomap_write_direct(ip, offset, length, &imap, - nimaps); + nimaps, &seq); if (error) return error; @@ -1060,7 +1097,9 @@ xfs_file_iomap_begin( & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; - xfs_bmbt_to_iomap(ip, iomap, &imap); + if (!seq) + seq = xfs_iomap_inode_sequence(ip, shared ? 
IOMAP_F_SHARED : 0); + xfs_bmbt_to_iomap(ip, iomap, &imap, seq); if (shared) iomap->flags |= IOMAP_F_SHARED; @@ -1068,6 +1107,7 @@ xfs_file_iomap_begin( out_found: ASSERT(nimaps); + seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, 0, &imap); goto out_finish; @@ -1169,6 +1209,7 @@ xfs_xattr_iomap_begin( struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; + u64 seq; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1184,12 +1225,13 @@ xfs_xattr_iomap_begin( ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); out_unlock: xfs_iunlock(ip, lockmode); if (!error) { ASSERT(nimaps); - xfs_bmbt_to_iomap(ip, iomap, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap, seq); } return error; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c6170548831bec4da8b7cea7a1847e60dcdc7e82..14ff74627c2658dbe2545b1c9626e81fd5cf57c1 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -12,13 +12,14 @@ struct xfs_inode; struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int); + struct xfs_bmbt_irec *, int, u64 *); int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *, unsigned int *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); +u64 xfs_iomap_inode_sequence(struct xfs_inode *, u16); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *); + struct xfs_bmbt_irec *, u64); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); static inline xfs_filblks_t diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 1c9bced3e8601dafd6256a3a76419353867134bc..1d988086c90addfd503779093814b07785c1e3e2 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -98,6 +98,7 @@ xfs_fs_map_blocks( 
int nimaps = 1; uint lock_flags; int error = 0; + u64 seq; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -149,6 +150,7 @@ xfs_fs_map_blocks( lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); + seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, lock_flags); if (error) @@ -166,7 +168,7 @@ xfs_fs_map_blocks( */ xfs_ilock(ip, XFS_ILOCK_SHARED); error = xfs_iomap_write_direct(ip, offset, length, - &imap, nimaps); + &imap, nimaps, &seq); if (error) goto out_unlock; @@ -185,7 +187,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_bmbt_to_iomap(ip, iomap, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap, seq); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4ce1290ac0c3fa3f1c6a88d55ccb11ccad7dbc71..4bd65630bb757c62975114c8e6fb7c29e9ec087a 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -26,7 +26,6 @@ struct vm_fault; #define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */ #define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */ #define IOMAP_INLINE 0x05 /* data inline in the inode */ - /* * Flags for all iomap mappings: * @@ -42,6 +41,19 @@ struct vm_fault; */ #define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */ #define IOMAP_F_SHARED 0x20 /* block shared with another file */ +/* + * + * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent + * rather than a file data extent. + */ +#define IOMAP_F_XATTR 0x40 + +/* + * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file + * range it covers needs to be remapped by the high level before the operation + * can proceed. 
+ */ +#define IOMAP_F_STALE 0x200 /* * Flags from 0x1000 up are for file system specific usage: @@ -67,6 +79,7 @@ struct iomap { struct dax_device *dax_dev; /* dax_dev for dax operations */ void *inline_data; void *private; /* filesystem private */ + u64 validity_cookie; /* used with .iomap_valid() */ /* * Called when finished processing a page in the mapping returned in @@ -75,6 +88,23 @@ struct iomap { */ void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, struct page *page, struct iomap *iomap); + + /* + * Check that the cached iomap still maps correctly to the filesystem's + * internal extent map. FS internal extent maps can change while iomap + * is iterating a cached iomap, so this hook allows iomap to detect that + * the iomap needs to be refreshed during a long running write + * operation. + * + * The filesystem can store internal state (e.g. a sequence number) in + * iomap->validity_cookie when the iomap is first mapped to be able to + * detect changes between mapping time and whenever .iomap_valid() is + * called. + * + * This is called with the page over the specified file position held + * locked by the iomap code. + */ + bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); }; /*