diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c900a39666e38bd75c9829c3e595c9101e65c138..eb05e7946f9438afdf14da0af7eadd3bcd7078bb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8084,10 +8084,10 @@ ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ if (current->journal_info) ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, - &btrfs_sync_dops, is_sync_kiocb(iocb)); + &btrfs_sync_dops, 0); else ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, - &btrfs_dio_ops, is_sync_kiocb(iocb)); + &btrfs_dio_ops, 0); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index e474e064d65c377790012a3e90a5d4231fdf82a6..600c13fba91ace6a63b484ea9b2b2913c035812a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -75,8 +75,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } - ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, - is_sync_kiocb(iocb)); + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -479,8 +478,10 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; + const struct iomap_dio_ops *iomap_dops = &ext4_dio_write_ops; bool extend = false, unaligned_io = false; bool ilock_shared = true; + int dio_flags = 0; /* * We initially start with shared inode lock unless it is @@ -570,10 +571,14 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_journal_stop(handle); } - if (ilock_shared) + if (ilock_shared) { iomap_ops = &ext4_iomap_overwrite_ops; - ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - is_sync_kiocb(iocb) || unaligned_io || extend); + iomap_dops = NULL; + dio_flags = IOMAP_DIO_MAY_INLINE_COMP; + } + if (unaligned_io || extend) + dio_flags |= IOMAP_DIO_FORCE_WAIT; + ret = 
iomap_dio_rw(iocb, from, iomap_ops, iomap_dops, dio_flags); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 55a8eb3c196341ebfca0cde0aa443d76b55f8dc9..24ab28f02004b42e71e428724d51287700d70b93 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -798,9 +798,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, if (ret) goto out_uninit; - ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, - is_sync_kiocb(iocb)); - + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0); gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); @@ -834,8 +832,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, if (offset + len > i_size_read(&ip->i_inode)) goto out; - ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, - is_sync_kiocb(iocb)); + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0); if (ret == -ENOTBLK) ret = 0; out: diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 933f234d5becd01a7ab369abbc7476a6d0b89b14..892a4f8109e50d32df2b6cc55126156bae8c65fd 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -18,10 +18,11 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ -#define IOMAP_DIO_WRITE_FUA (1 << 28) -#define IOMAP_DIO_NEED_SYNC (1 << 29) -#define IOMAP_DIO_WRITE (1 << 30) -#define IOMAP_DIO_DIRTY (1 << 31) +#define IOMAP_DIO_INLINE_COMP (1U << 27) +#define IOMAP_DIO_WRITE_FUA (1U << 28) +#define IOMAP_DIO_NEED_SYNC (1U << 29) +#define IOMAP_DIO_WRITE (1U << 30) +#define IOMAP_DIO_DIRTY (1U << 31) struct iomap_dio { struct kiocb *iocb; @@ -109,7 +110,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * zeros from unwritten extents. 
*/ if (!dio->error && dio->size && - (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages && + !(dio->flags & IOMAP_DIO_INLINE_COMP)) { int err; err = invalidate_inode_pages2_range(inode->i_mapping, offset >> PAGE_SHIFT, @@ -123,8 +125,10 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * If this is a DSYNC write, make sure we push it to stable storage now * that we've written data. */ - if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) { + WARN_ON_ONCE(dio->flags & IOMAP_DIO_INLINE_COMP); ret = generic_write_sync(iocb, ret); + } kfree(dio); @@ -154,25 +158,43 @@ static void iomap_dio_bio_end_io(struct bio *bio) { struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + struct kiocb *iocb = dio->iocb; if (bio->bi_status) iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + if (!atomic_dec_and_test(&dio->ref)) + goto release_bio; - if (atomic_dec_and_test(&dio->ref)) { - if (dio->wait_for_completion) { - struct task_struct *waiter = dio->submit.waiter; - WRITE_ONCE(dio->submit.waiter, NULL); - blk_wake_io_task(waiter); - } else if (dio->flags & IOMAP_DIO_WRITE) { - struct inode *inode = file_inode(dio->iocb->ki_filp); + /* + * Synchronous dio: the waiting task itself handles any completion + * work needed after the I/O. All we need to do is wake the task. 
+ */ + if (dio->wait_for_completion) { + struct task_struct *waiter = dio->submit.waiter; - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); - } else { - iomap_dio_complete_work(&dio->aio.work); - } + WRITE_ONCE(dio->submit.waiter, NULL); + blk_wake_io_task(waiter); + goto release_bio; } + /* + * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline + */ + if (dio->flags & IOMAP_DIO_INLINE_COMP) { + iomap_dio_complete_work(&dio->aio.work); + goto release_bio; + } + + /* + * Async DIO completion that requires filesystem level completion work + * gets punted to a work queue to complete as the operation may require + * more IO to be issued to finalise filesystem metadata changes or + * guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, + &dio->aio.work); +release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -420,23 +442,22 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio * __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - bool wait_for_completion) + unsigned int dio_flags) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; - unsigned int flags = IOMAP_DIRECT; + bool wait_for_completion = + is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT); + unsigned int iomap_flags = IOMAP_DIRECT; struct blk_plug plug; struct iomap_dio *dio; if (!count) return NULL; - if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion)) - return ERR_PTR(-EIO); - dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) return ERR_PTR(-ENOMEM); @@ -455,18 +476,25 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 
dio->submit.last_queue = NULL; if (iov_iter_rw(iter) == READ) { + /* reads can always complete inline */ + dio->flags |= IOMAP_DIO_INLINE_COMP; + if (pos >= dio->i_size) goto out_free_dio; if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { - flags |= IOMAP_WRITE; + iomap_flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; /* for data sync or sync, we need sync completion processing */ if (iocb->ki_flags & IOCB_DSYNC) dio->flags |= IOMAP_DIO_NEED_SYNC; + else if (dio_flags & IOMAP_DIO_MAY_INLINE_COMP) { + /* writes could complete inline */ + dio->flags |= IOMAP_DIO_INLINE_COMP; + } /* * For datasync only writes, we optimistically try using FUA for @@ -483,7 +511,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, ret = -EAGAIN; goto out_free_dio; } - flags |= IOMAP_NOWAIT; + iomap_flags |= IOMAP_NOWAIT; } ret = filemap_write_and_wait_range(mapping, pos, end); @@ -514,7 +542,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); do { - ret = iomap_apply(inode, pos, count, flags, ops, dio, + ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio, iomap_dio_actor); if (ret <= 0) { /* magic error code to fall back to buffered I/O */ @@ -598,11 +626,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - bool wait_for_completion) + unsigned int dio_flags) { struct iomap_dio *dio; - dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion); + dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags); if (IS_ERR_OR_NULL(dio)) return PTR_ERR_OR_ZERO(dio); return iomap_dio_complete(dio); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9f52365995c7bc5279e94fc5c8331ba95322e346..52643eac5d46ee692b1d171eec8effaa511e503c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -263,8 +263,7 @@ xfs_file_dio_aio_read( ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) return ret; - ret = 
iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, - is_sync_kiocb(iocb)); + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -651,7 +650,7 @@ xfs_file_dio_aio_write( */ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, - is_sync_kiocb(iocb) || unaligned_io); + unaligned_io ? IOMAP_DIO_FORCE_WAIT : 0); out: if (iolock) xfs_iunlock(ip, iolock); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 1e53976d597537ca76402b59180e8498804d221d..bcdc14e99bb13a5a30121b15d5b3cb9106bf1a5a 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -932,7 +932,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ret = zonefs_file_dio_append(iocb, from); else ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, - &zonefs_write_dio_ops, sync); + &zonefs_write_dio_ops, 0); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) @@ -1067,7 +1067,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } file_accessed(iocb->ki_filp); ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, - &zonefs_read_dio_ops, is_sync_kiocb(iocb)); + &zonefs_read_dio_ops, 0); } else { ret = generic_file_read_iter(iocb, to); if (ret == -EIO) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 78520f28806af33c533f95e9fa8828df2bb1d05a..0965d5f12858e3d9546d0baa98ac5f128dd604f1 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -291,12 +291,24 @@ struct iomap_dio_ops { struct bio *bio, loff_t file_offset); }; +/* + * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not + * synchronous. + */ +#define IOMAP_DIO_FORCE_WAIT (1 << 0) + +/* + * The DIO will be completed inline unless a sync operation is needed after + * the I/O has finished. 
+ */ +#define IOMAP_DIO_MAY_INLINE_COMP (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - bool wait_for_completion); + unsigned int dio_flags); struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - bool wait_for_completion); + unsigned int dio_flags); ssize_t iomap_dio_complete(struct iomap_dio *dio); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);