diff --git a/MAINTAINERS b/MAINTAINERS index 6ddb6b82eef5cb0a900aa6e8002b82e3eefaa9ea..9872276959714462d011271d0ea4a45604ec35a1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8064,6 +8064,15 @@ F: include/linux/fs_types.h F: include/uapi/linux/fs.h F: include/uapi/linux/openat2.h +FILESYSTEMS [STACKABLE] +M: Miklos Szeredi +M: Amir Goldstein +L: linux-fsdevel@vger.kernel.org +L: linux-unionfs@vger.kernel.org +S: Maintained +F: fs/backing-file.c +F: include/linux/backing-file.h + FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio L: linux-hwmon@vger.kernel.org diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index d5b3ed2c58f5030ab8022ac260b7ce8e172c2fa7..c380d8c30704db529a6a9e8ef7d5001ba83a0d7a 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -90,10 +90,12 @@ static void show_faulting_vma(unsigned long address) */ if (vma) { char buf[ARC_PATH_MAX]; - char *nm = "?"; + char *nm = "anon"; if (vma->vm_file) { - nm = file_path(vma->vm_file, buf, ARC_PATH_MAX-1); + /* XXX: can we use %pD below and get rid of buf? */ + nm = d_path(file_user_path(vma->vm_file), buf, + ARC_PATH_MAX-1); if (IS_ERR(nm)) nm = "?"; } diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 3f9be66edece967c66e0ca25da824174d078030e..11500a2d3ba1be54accc50f854a451f0ee0f86cd 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7050,6 +7050,7 @@ CONFIG_FUSE_FS=m CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_FUSE_DAX=y +CONFIG_FUSE_PASSTHROUGH=y CONFIG_OVERLAY_FS=m # CONFIG_OVERLAY_FS_REDIRECT_DIR is not set CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW=y @@ -7057,7 +7058,7 @@ CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW=y # CONFIG_OVERLAY_FS_XINO_AUTO is not set # CONFIG_OVERLAY_FS_METACOPY is not set # CONFIG_OVERLAY_FS_DEBUG is not set - +CONFIG_FS_STACK=y # # Caches # diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 49fc7487fb0219f8e7c5027bfc4afb0ed72f7627..388e70b8491f63fb151ac1b6c6d39ffa712cbebd 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -8267,6 +8267,7 @@ CONFIG_FUSE_FS=m CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_FUSE_DAX=y +CONFIG_FUSE_PASSTHROUGH=y CONFIG_OVERLAY_FS=m # CONFIG_OVERLAY_FS_REDIRECT_DIR is not set # CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set @@ -8274,7 +8275,7 @@ CONFIG_OVERLAY_FS=m # CONFIG_OVERLAY_FS_XINO_AUTO is not set # CONFIG_OVERLAY_FS_METACOPY is not set # CONFIG_OVERLAY_FS_DEBUG is not set - +CONFIG_FS_STACK=y # # Caches # diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 886c635990377921d7df844bb2dcc24d450ca746..bd0e138ea6b6d0dbd5851c2d222fc751dcdb24c9 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -240,9 +240,7 @@ static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len); - file_start_write(file); bw = vfs_iter_write(file, &i, ppos, 0); - file_end_write(file); if (likely(bw == bvec->bv_len)) return 0; diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 4e4cf6c34a775c4483c4b16ad7bf6e78ef72f220..4d447520bab87d6133ba2611f37c4955e641fe5c 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -332,13 +332,11 @@ static int fd_do_rw(struct se_cmd *cmd, struct file *fd, } iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len); - if (is_write) { - file_start_write(fd); + if (is_write) ret = vfs_iter_write(fd, &iter, &pos, 0); - file_end_write(fd); - } else { + else ret = vfs_iter_read(fd, &iter, &pos, 0); - } + if (is_write) { if (ret < 0 || ret != data_length) { pr_err("%s() write returned %d\n", __func__, ret); @@ -469,9 +467,7 @@ fd_execute_write_same(struct se_cmd *cmd) } iov_iter_bvec(&iter, ITER_SOURCE, bvec, nolb, len); - file_start_write(fd_dev->fd_file); ret = vfs_iter_write(fd_dev->fd_file, &iter, &pos, 0); - file_end_write(fd_dev->fd_file); kfree(bvec); if (ret < 0 || ret != len) { diff --git a/fs/Kconfig b/fs/Kconfig index 9765213bf01d524840a1365986c9cda8cfeb4d7a..3d1185e3e6a8599cf67a3115f8c251a5a60eab2f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +# Stackable filesystems +config FS_STACK + bool + config BUFFER_HEAD bool diff --git a/fs/Makefile b/fs/Makefile index 45e54c34c309ca6809d25feaba59388b46dddf19..81428bad22f072162bff406eeae4acca1ec2316d 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o +obj-$(CONFIG_FS_STACK) += backing-file.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ diff --git a/fs/backing-file.c b/fs/backing-file.c new file mode 100644 index 0000000000000000000000000000000000000000..e1a4b23b255b4e52da36b9a45b7c959946eec957 --- /dev/null +++ b/fs/backing-file.c @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common helpers for stackable filesystems and backing files. + * + * Forked from fs/overlayfs/file.c. + * + * Copyright (C) 2017 Red Hat, Inc. + * Copyright (C) 2023 CTERA Networks. + */ + +#include +#include +#include +#include + +#include "internal.h" + +/** + * backing_file_open - open a backing file for kernel internal use + * @user_path: path that the user reuqested to open + * @flags: open flags + * @real_path: path of the backing file + * @cred: credentials for open + * + * Open a backing file for a stackable filesystem (e.g., overlayfs). + * @user_path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * @user_path of the stackable filesystem. This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). + */ +struct file *backing_file_open(const struct path *user_path, int flags, + const struct path *real_path, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred); + if (IS_ERR(f)) + return f; + + path_get(user_path); + *backing_file_user_path(f) = *user_path; + error = vfs_open(real_path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + + return f; +} +EXPORT_SYMBOL_GPL(backing_file_open); + +struct backing_aio { + struct kiocb iocb; + refcount_t ref; + struct kiocb *orig_iocb; + /* used for aio completion */ + void (*end_write)(struct file *, loff_t, ssize_t); + struct work_struct work; + long res; +}; + +static struct kmem_cache *backing_aio_cachep; + +#define BACKING_IOCB_MASK \ + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) + +static rwf_t iocb_to_rw_flags(int flags) +{ + return (__force rwf_t)(flags & BACKING_IOCB_MASK); +} + +static void backing_aio_put(struct backing_aio *aio) +{ + if (refcount_dec_and_test(&aio->ref)) { + fput(aio->iocb.ki_filp); + kmem_cache_free(backing_aio_cachep, aio); + } +} + +static void backing_aio_cleanup(struct backing_aio *aio, long res) +{ + struct kiocb *iocb = &aio->iocb; + struct kiocb *orig_iocb = aio->orig_iocb; + + if (aio->end_write) + aio->end_write(orig_iocb->ki_filp, iocb->ki_pos, res); + + orig_iocb->ki_pos = iocb->ki_pos; + backing_aio_put(aio); +} + +static void backing_aio_rw_complete(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + struct kiocb *orig_iocb = aio->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) + kiocb_end_write(iocb); + + backing_aio_cleanup(aio, res); + orig_iocb->ki_complete(orig_iocb, res); +} + +static void backing_aio_complete_work(struct work_struct *work) +{ + struct backing_aio *aio = container_of(work, struct backing_aio, work); + + backing_aio_rw_complete(&aio->iocb, aio->res); +} + +static void backing_aio_queue_completion(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + + /* + * Punt to a work queue to serialize updates of mtime/size. + */ + aio->res = res; + INIT_WORK(&aio->work, backing_aio_complete_work); + queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, + &aio->work); +} + +static int backing_aio_init_wq(struct kiocb *iocb) +{ + struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; + + if (sb->s_dio_done_wq) + return 0; + + return sb_init_dio_done_wq(sb); +} + + +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + struct backing_aio *aio = NULL; + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); + } else { + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_complete = backing_aio_rw_complete; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_read_iter); + +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + /* + * Stacked filesystems don't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + flags &= ~IOCB_DIO_CALLER_COMP; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); + if (ctx->end_write) + ctx->end_write(ctx->user_file, iocb->ki_pos, ret); + } else { + struct backing_aio *aio; + + ret = backing_aio_init_wq(iocb); + if (ret) + goto out; + + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + aio->end_write = ctx->end_write; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_flags = flags; + aio->iocb.ki_complete = backing_aio_queue_completion; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_write_iter); + +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) + return -EIO; + + old_cred = override_creds(ctx->cred); + ret = vfs_splice_read(in, ppos, pipe, len, flags); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_read); + +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) + return -EIO; + + if (!out->f_op->splice_write) + return -EINVAL; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + old_cred = override_creds(ctx->cred); + file_start_write(out); + ret = out->f_op->splice_write(pipe, out, ppos, len, flags); + file_end_write(out); + revert_creds(old_cred); + + if (ctx->end_write) + ctx->end_write(ctx->user_file, ppos ? *ppos : 0, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_write); + +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + int ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || + WARN_ON_ONCE(ctx->user_file != vma->vm_file)) + return -EIO; + + if (!file->f_op->mmap) + return -ENODEV; + + vma_set_file(vma, file); + + old_cred = override_creds(ctx->cred); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_mmap); + +static int __init backing_aio_init(void) +{ + backing_aio_cachep = kmem_cache_create("backing_aio", + sizeof(struct backing_aio), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!backing_aio_cachep) + return -ENOMEM; + + return 0; +} +fs_initcall(backing_aio_init); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d61bd816808ceb8e985a3bda435933e3ef16cd5b..3d1cb018f3c4a321273749aafcdd1a7282a8f427 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4559,29 +4559,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (ret < 0) goto out_acct; - file_start_write(file); - if (iov_iter_count(&iter) == 0) { ret = 0; - goto out_end_write; + goto out_iov; } pos = args.offset; ret = rw_verify_area(WRITE, file, &pos, args.len); if (ret < 0) - goto out_end_write; + goto out_iov; init_sync_kiocb(&kiocb, file); ret = kiocb_set_rw_flags(&kiocb, 0); if (ret) - goto out_end_write; + goto out_iov; kiocb.ki_pos = pos; + file_start_write(file); + ret = btrfs_do_write_iter(&kiocb, &iter, &args); if (ret > 0) fsnotify_modify(file); -out_end_write: file_end_write(file); +out_iov: kfree(iov); out_acct: if (ret > 0) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index e55c7cc6a997649ab3bcdd7b60115ac266df094c..f7a507ddd668d050c2b75af89aa4b11f7595ba68 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -268,7 +268,8 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) _enter("%ld", ret); - kiocb_end_write(iocb); + if (ki->was_async) + kiocb_end_write(iocb); if (ret < 0) trace_cachefiles_io_error(object, inode, ret, @@ -331,8 +332,6 @@ int __cachefiles_write(struct cachefiles_object *object, ki->iocb.ki_complete = cachefiles_write_complete; atomic_long_add(ki->b_writing, &cache->b_writing); - kiocb_start_write(&ki->iocb); - get_file(ki->iocb.ki_filp); cachefiles_grab_object(object, cachefiles_obj_get_ioreq); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a03b11cf788721b2d548f2b931c13561e5608fd7..15f386a5d24c35da5483f31489263b59df822727 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -2972,8 +2973,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * {read,write}_iter, which will get caps again. */ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); - ret = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, src_objlen, flags); + ret = splice_file_range(src_file, &src_off, dst_file, &dst_off, + src_objlen); /* Abort on short copies or on error */ if (ret < (long)src_objlen) { dout("Failed partial copy (%zd)\n", ret); @@ -3027,8 +3028,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, */ if (len && (len < src_ci->i_layout.object_size)) { dout("Final partial copy of %zu bytes\n", len); - bytes = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, len, flags); + bytes = splice_file_range(src_file, &src_off, dst_file, + &dst_off, len); if (bytes > 0) ret += bytes; else diff --git a/fs/coda/file.c b/fs/coda/file.c index 42346618b4ed192d0c62d268ab52b941fd204706..5e5ea5bc37018afa0cc1766ac903bdbef0c6128a 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -79,14 +79,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) if (ret) goto finish_write; - file_start_write(host_file); inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = inode_set_ctime_current(coda_inode); inode_unlock(coda_inode); - file_end_write(host_file); finish_write: venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), diff --git a/fs/file_table.c b/fs/file_table.c index 234284ef72a9a5b22cec70f0d45009b9b686df00..a5a3a385f24c4f10aec11c1db04c5c071d8960ec 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* Container for backing file with optional real path */ +/* Container for backing file with optional user path */ struct backing_file { struct file file; - struct path real_path; + struct path user_path; }; static inline struct backing_file *backing_file(struct file *f) @@ -55,11 +55,11 @@ static inline struct backing_file *backing_file(struct file *f) return container_of(f, struct backing_file, file); } -struct path *backing_file_real_path(struct file *f) +struct path *backing_file_user_path(struct file *f) { - return &backing_file(f)->real_path; + return &backing_file(f)->user_path; } -EXPORT_SYMBOL_GPL(backing_file_real_path); +EXPORT_SYMBOL_GPL(backing_file_user_path); static void file_free_rcu(struct rcu_head *head) { @@ -76,7 +76,7 @@ static inline void file_free(struct file *f) { security_file_free(f); if (unlikely(f->f_mode & FMODE_BACKING)) - path_put(backing_file_real_path(f)); + path_put(backing_file_user_path(f)); if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); call_rcu(&f->f_rcuhead, file_free_rcu); diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 038ed0b9aaa5d619cf498c55ffd0069dce2abf14..8674dbfbe59dbf79c304c587b08ebba3cfe405be 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -52,3 +52,14 @@ config FUSE_DAX If you want to allow mounting a Virtio Filesystem with the "dax" option, answer Y. + +config FUSE_PASSTHROUGH + bool "FUSE passthrough operations support" + default y + depends on FUSE_FS + select FS_STACK + help + This allows bypassing FUSE server by mapping specific FUSE operations + to be performed directly on a backing file. + + If you want to allow passthrough operations, answer Y. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 0c48b35c058d78d20966443de6a56928020729b4..6e0228c6d0cba9541c8668efb86b83094751d469 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -8,6 +8,8 @@ obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o +fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8573d79ef29c80c85ddaedfbb6d9f98675fcab41..3c842286cdbbde078e13754a8cd629b196920c47 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2253,43 +2253,91 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) return 0; } -static long fuse_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) { int res; int oldfd; struct fuse_dev *fud = NULL; struct fd f; + if (get_user(oldfd, argp)) + return -EFAULT; + + f = fdget(oldfd); + if (!f.file) + return -EINVAL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (f.file->f_op == file->f_op) + fud = fuse_get_dev(f.file); + + res = -EINVAL; + if (fud) { + mutex_lock(&fuse_mutex); + res = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + + fdput(f); + return res; +} + +static long fuse_dev_ioctl_backing_open(struct file *file, + struct fuse_backing_map __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_backing_map map; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (copy_from_user(&map, argp, sizeof(map))) + return -EFAULT; + + return fuse_backing_open(fud->fc, &map); +} + +static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + int backing_id; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (get_user(backing_id, argp)) + return -EFAULT; + + return fuse_backing_close(fud->fc, backing_id); +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + switch (cmd) { case FUSE_DEV_IOC_CLONE: - if (get_user(oldfd, (__u32 __user *)arg)) - return -EFAULT; + return fuse_dev_ioctl_clone(file, argp); - f = fdget(oldfd); - if (!f.file) - return -EINVAL; + case FUSE_DEV_IOC_BACKING_OPEN: + return fuse_dev_ioctl_backing_open(file, argp); + + case FUSE_DEV_IOC_BACKING_CLOSE: + return fuse_dev_ioctl_backing_close(file, argp); - /* - * Check against file->f_op because CUSE - * uses the same ioctl handler. - */ - if (f.file->f_op == file->f_op) - fud = fuse_get_dev(f.file); - - res = -EINVAL; - if (fud) { - mutex_lock(&fuse_mutex); - res = fuse_device_clone(fud->fc, file); - mutex_unlock(&fuse_mutex); - } - fdput(f); - break; default: - res = -ENOTTY; - break; + return -ENOTTY; } - return res; } const struct file_operations fuse_dev_operations = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 95f9913a35373184b7b50805aa0e50ea967aa601..2b55671425295fa0dd643d576dcd0a66968123f3 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -619,7 +619,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; - struct fuse_open_out outopen; + struct fuse_open_out *outopenp; struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; @@ -634,7 +634,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, true); if (!ff) goto out_put_forget_req; @@ -663,8 +663,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_numargs = 2; args.out_args[0].size = sizeof(outentry); args.out_args[0].value = &outentry; - args.out_args[1].size = sizeof(outopen); - args.out_args[1].value = &outopen; + /* Store outarg for fuse_finish_open() */ + outopenp = &ff->args->open_outarg; + args.out_args[1].size = sizeof(*outopenp); + args.out_args[1].value = outopenp; err = get_create_ext(&args, dir, entry, mode); if (err) @@ -680,9 +682,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, fuse_invalid_attr(&outentry.attr)) goto out_free_ff; - ff->fh = outopen.fh; + ff->fh = outopenp->fh; ff->nodeid = outentry.nodeid; - ff->open_flags = outopen.open_flags; + ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, ATTR_TIMEOUT(&outentry), 0); if (!inode) { @@ -696,13 +698,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); - err = finish_open(file, entry, generic_file_open); + err = generic_file_open(inode, file); + if (!err) { + file->private_data = ff; + err = finish_open(file, entry, fuse_finish_open); + } if (err) { fi = get_fuse_inode(inode); fuse_sync_release(fi, ff, flags); } else { - file->private_data = ff; - fuse_finish_open(inode, file); if (fm->fc->atomic_o_trunc && trunc) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) @@ -1635,7 +1639,32 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, static int fuse_dir_open(struct inode *inode, struct file *file) { - return fuse_open_common(inode, file, true); + struct fuse_mount *fm = get_fuse_mount(inode); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + err = generic_file_open(inode, file); + if (err) + return err; + + err = fuse_do_open(fm, get_node_id(inode), file, true); + if (!err) { + struct fuse_file *ff = file->private_data; + + /* + * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for + * directories for backward compatibility, though it's unlikely + * to be useful. + */ + if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) + nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + } + + return err; } static int fuse_dir_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index fca2be8983367d771d4d56d18aa38e276099a6ad..25d6951ef2c04f3776efa9b298a3c50b9eb781d9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -49,13 +49,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, return fuse_simple_request(fm, &args); } -struct fuse_release_args { - struct fuse_args args; - struct fuse_release_in inarg; - struct inode *inode; -}; - -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; @@ -64,11 +58,12 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) return NULL; ff->fm = fm; - ff->release_args = kzalloc(sizeof(*ff->release_args), - GFP_KERNEL_ACCOUNT); - if (!ff->release_args) { - kfree(ff); - return NULL; + if (release) { + ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT); + if (!ff->args) { + kfree(ff); + return NULL; + } } INIT_LIST_HEAD(&ff->write_entry); @@ -84,7 +79,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) void fuse_file_free(struct fuse_file *ff) { - kfree(ff->release_args); + kfree(ff->args); mutex_destroy(&ff->readdir.lock); kfree(ff); } @@ -104,14 +99,17 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, kfree(ra); } -static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_args *args = &ff->release_args->args; + struct fuse_release_args *ra = &ff->args->release_args; + struct fuse_args *args = (ra ? &ra->args : NULL); - if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { - /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fm, args, 0); + if (ra && ra->inode) + fuse_file_io_release(ff, ra->inode); + + if (!args) { + /* Do nothing when server does not implement 'open' */ } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); @@ -131,27 +129,31 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + bool open = isdir ? !fc->no_opendir : !fc->no_open; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, open); if (!ff) return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); - if (isdir ? !fc->no_opendir : !fc->no_open) { - struct fuse_open_out outarg; + if (open) { + /* Store outarg for fuse_finish_open() */ + struct fuse_open_out *outargp = &ff->args->open_outarg; int err; - err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); if (!err) { - ff->fh = outarg.fh; - ff->open_flags = outarg.open_flags; - + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); } else { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; if (isdir) fc->no_opendir = 1; else @@ -194,40 +196,50 @@ static void fuse_link_write_file(struct file *file) spin_unlock(&fi->lock); } -void fuse_finish_open(struct inode *inode, struct file *file) +int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + err = fuse_file_io_open(file, inode); + if (err) + return err; if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); - i_size_write(inode, 0); - spin_unlock(&fi->lock); - file_update_time(file); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); + + return 0; } -int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +static void fuse_truncate_update_attr(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + +static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = fm->fc; + struct fuse_file *ff; int err; - bool is_wb_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && - fc->writeback_cache; - bool dax_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && FUSE_IS_DAX(inode); + bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; + bool is_wb_truncate = is_truncate && fc->writeback_cache; + bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); if (fuse_is_bad(inode)) return -EIO; @@ -249,16 +261,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (is_wb_truncate || dax_truncate) fuse_set_nowrite(inode); - err = fuse_do_open(fm, get_node_id(inode), file, isdir); - if (!err) - fuse_finish_open(inode, file); + err = fuse_do_open(fm, get_node_id(inode), file, false); + if (!err) { + ff = file->private_data; + err = fuse_finish_open(inode, file); + if (err) + fuse_sync_release(fi, ff, file->f_flags); + else if (is_truncate) + fuse_truncate_update_attr(inode, file); + } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { - struct fuse_file *ff = file->private_data; - - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); @@ -273,10 +289,13 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, - unsigned int flags, int opcode) + unsigned int flags, int opcode, bool sync) { struct fuse_conn *fc = ff->fm->fc; - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; + + if (fuse_file_passthrough(ff)) + fuse_passthrough_release(ff, fuse_inode_backing(fi)); /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -291,6 +310,11 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); + if (!ra) + return; + + /* ff->args was used for open outarg */ + memset(ff->args, 0, sizeof(*ff->args)); ra->inarg.fh = ff->fh; ra->inarg.flags = flags; ra->args.in_numargs = 1; @@ -300,23 +324,28 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, ra->args.nodeid = ff->nodeid; ra->args.force = true; ra->args.nocreds = true; + + /* + * Hold inode until release is finished. + * From fuse_sync_release() the refcount is 1 and everything's + * synchronous, so we are fine with not doing igrab() here. + */ + ra->inode = sync ? NULL : igrab(&fi->inode); } void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir) { struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(fi, ff, open_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode, false); - if (ff->flock) { + if (ra && ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } - /* Hold inode until release is finished */ - ra->inode = igrab(inode); /* * Normally this will send the RELEASE request, however if @@ -327,7 +356,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fm->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy); } void fuse_release_common(struct file *file, bool isdir) @@ -336,11 +365,6 @@ void fuse_release_common(struct file *file, bool isdir) (fl_owner_t) file, isdir); } -static int fuse_open(struct inode *inode, struct file *file) -{ - return fuse_open_common(inode, file, false); -} - static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -362,12 +386,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); - /* - * iput(NULL) is a no-op and since the refcount is 1 and everything's - * synchronous, we are fine with not doing igrab() here" - */ - fuse_file_put(ff, true, false); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); + fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -931,7 +951,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, put_page(page); } if (ia->ff) - fuse_file_put(ia->ff, false, false); + fuse_file_put(ia->ff, false); fuse_io_free(ia); } @@ -1305,6 +1325,86 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) return res; } +static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + +/* + * @return true if an exclusive lock for direct IO writes is needed + */ +static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* Server side has to advise that it supports parallel dio writes. */ + if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) + return true; + + /* + * Append will need to know the eventual EOF - always needs an + * exclusive lock. + */ + if (iocb->ki_flags & IOCB_APPEND) + return true; + + /* shared locks are not allowed with parallel page cache IO */ + if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) + return true; + + /* Parallel dio beyond EOF is not supported, at least for now. */ + if (fuse_io_past_eof(iocb, from)) + return true; + + return false; +} + +static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, + bool *exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); + if (*exclusive) { + inode_lock(inode); + } else { + inode_lock_shared(inode); + /* + * New parallal dio allowed only if inode is not in caching + * mode and denies new opens in caching mode. This check + * should be performed only after taking shared inode lock. + * Previous past eof check was without inode lock and might + * have raced, so check it again. + */ + if (fuse_io_past_eof(iocb, from) || + fuse_inode_uncached_io_start(fi, NULL) != 0) { + inode_unlock_shared(inode); + inode_lock(inode); + *exclusive = true; + } + } +} + +static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + if (exclusive) { + inode_unlock(inode); + } else { + /* Allow opens in caching mode after last parallel dio end */ + fuse_inode_uncached_io_end(fi); + inode_unlock_shared(inode); + } +} + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1581,49 +1681,14 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } -static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, - struct iov_iter *iter) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); -} - static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct file *file = iocb->ki_filp; - struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - bool exclusive_lock = - !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || - get_fuse_conn(inode)->direct_io_allow_mmap || - iocb->ki_flags & IOCB_APPEND || - fuse_direct_write_extending_i_size(iocb, from); - - /* - * Take exclusive lock if - * - Parallel direct writes are disabled - a user space decision - * - Parallel direct writes are enabled and i_size is being extended. - * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP). - * This might not be needed at all, but needs further investigation. - */ - if (exclusive_lock) - inode_lock(inode); - else { - inode_lock_shared(inode); - - /* A race with truncate might have come up as the decision for - * the lock type was done without holding the lock, check again. - */ - if (fuse_direct_write_extending_i_size(iocb, from)) { - inode_unlock_shared(inode); - inode_lock(inode); - exclusive_lock = true; - } - } + bool exclusive; + fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); if (res > 0) { if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { @@ -1634,10 +1699,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - if (exclusive_lock) - inode_unlock(inode); - else - inode_unlock_shared(inode); + fuse_dio_unlock(iocb, exclusive); return res; } @@ -1654,10 +1716,13 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (FUSE_IS_DAX(inode)) return fuse_dax_read_iter(iocb, to); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) - return fuse_cache_read_iter(iocb, to); - else + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_read_iter(iocb, to); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_read_iter(iocb, to); + else + return fuse_cache_read_iter(iocb, to); } static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1672,10 +1737,38 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (FUSE_IS_DAX(inode)) return fuse_dax_write_iter(iocb, from); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) + return fuse_direct_write_iter(iocb, from); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_write_iter(iocb, from); + else return fuse_cache_write_iter(iocb, from); +} + +static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); else - return fuse_direct_write_iter(iocb, from); + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); + else + return iter_file_splice_write(pipe, out, ppos, len, flags); } static void fuse_writepage_free(struct fuse_writepage_args *wpa) @@ -1690,7 +1783,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) __free_page(ap->pages[i]); if (wpa->ia.ff) - fuse_file_put(wpa->ia.ff, false, false); + fuse_file_put(wpa->ia.ff, false); kfree(ap->pages); kfree(wpa); @@ -1938,7 +2031,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, false, false); + fuse_file_put(ff, false); return err; } @@ -2336,7 +2429,7 @@ static int fuse_writepages(struct address_space *mapping, fuse_writepages_send(&data); } if (data.ff) - fuse_file_put(data.ff, false, false); + fuse_file_put(data.ff, false); kfree(data.orig_pages); out: @@ -2491,11 +2584,27 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; + struct inode *inode = file_inode(file); + int rc; /* DAX mmap is superior to direct_io mmap */ - if (FUSE_IS_DAX(file_inode(file))) + if (FUSE_IS_DAX(inode)) return fuse_dax_mmap(file, vma); + /* + * If inode is in passthrough io mode, because it has some file open + * in passthrough mode, either mmap to backing file or fail mmap, + * because mixing cached mmap and passthrough io mode is not allowed. + */ + if (fuse_file_passthrough(ff)) + return fuse_passthrough_mmap(file, vma); + else if (fuse_inode_backing(get_fuse_inode(inode))) + return -ENODEV; + + /* + * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, + * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. + */ if (ff->open_flags & FOPEN_DIRECT_IO) { /* * Can't provide the coherency needed for MAP_SHARED @@ -2510,6 +2619,17 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) /* MAP_PRIVATE */ return generic_file_mmap(file, vma); } + + /* + * First mmap of direct_io file enters caching inode io mode. + * Also waits for parallel dio writers to go into serial mode + * (exclusive instead of shared lock). + * After first mmap, the inode stays in caching io mode until + * the direct_io file release. + */ + rc = fuse_file_cached_io_open(inode, ff); + if (rc) + return rc; } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) @@ -3246,8 +3366,8 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, + .splice_read = fuse_splice_read, + .splice_write = fuse_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, @@ -3278,7 +3398,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; + fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); + init_waitqueue_head(&fi->direct_io_waitq); fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1bb136bcbe9e7f6abffca0e53b34caaf2cb90467..0428658c499ed41149a9aab4b9951d8ef843d135 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -77,6 +77,16 @@ struct fuse_submount_lookup { struct fuse_forget_link *forget; }; +/** Container for data related to mapping to backing file */ +struct fuse_backing { + struct file *file; + struct cred *cred; + + /** refcount */ + refcount_t count; + struct rcu_head rcu; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -112,7 +122,7 @@ struct fuse_inode { u64 attr_version; union { - /* Write related fields (regular file only) */ + /* read/write io cache (regular file only) */ struct { /* Files usable in writepage. Protected by fi->lock */ struct list_head write_files; @@ -124,9 +134,15 @@ struct fuse_inode { * (FUSE_NOWRITE) means more writes are blocked */ int writectr; + /** Number of files/maps using page cache */ + int iocachectr; + /* Waitq for writepage completion */ wait_queue_head_t page_waitq; + /* waitq for direct-io completion */ + wait_queue_head_t direct_io_waitq; + /* List of writepage requestst (pending or sent) */ struct rb_root writepages; }; @@ -174,6 +190,10 @@ struct fuse_inode { #endif /** Submount specific lookup tracking */ struct fuse_submount_lookup *submount_lookup; +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct fuse_backing *fb; +#endif KABI_RESERVE(1) }; @@ -190,19 +210,21 @@ enum { FUSE_I_BAD, /* Has btime */ FUSE_I_BTIME, + /* Wants or already has page cache IO */ + FUSE_I_CACHE_IO_MODE, }; struct fuse_conn; struct fuse_mount; -struct fuse_release_args; +union fuse_file_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ struct fuse_mount *fm; - /* Argument space reserved for release */ - struct fuse_release_args *release_args; + /* Argument space reserved for open/release */ + union fuse_file_args *args; /** Kernel file handle guaranteed to be unique */ u64 kh; @@ -247,6 +269,15 @@ struct fuse_file { /** Wait queue head for poll */ wait_queue_head_t poll_wait; + /** Does file hold a fi->iocachectr refcount? */ + enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode; + +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct file *passthrough; + const struct cred *cred; +#endif + /** Has flock been performed on this file? */ bool flock:1; }; @@ -301,6 +332,19 @@ struct fuse_args_pages { unsigned int num_pages; }; +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + +union fuse_file_args { + /* Used during open() */ + struct fuse_open_out open_outarg; + /* Used during release() */ + struct fuse_release_args release_args; +}; + #define FUSE_ARGS(args) struct fuse_args args = {} /** The request IO state (for asynchronous processing) */ @@ -829,6 +873,12 @@ struct fuse_conn { /* Use pages instead of pointer for kernel I/O */ unsigned int use_pages_for_kvec_io:1; + /** Passthrough support for read/write IO */ + unsigned int passthrough:1; + + /** Maximum stack depth for passthrough backing files */ + int max_stack_depth; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -879,6 +929,11 @@ struct fuse_conn { /* New writepages go into this bucket */ struct fuse_sync_bucket __rcu *curr_bucket; +#ifdef CONFIG_FUSE_PASSTHROUGH + /** IDR for backing files ids */ + struct idr backing_files_map; +#endif + KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) @@ -1047,14 +1102,9 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); -/** - * Send OPEN or OPENDIR request - */ -int fuse_open_common(struct inode *inode, struct file *file, bool isdir); - -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); -void fuse_finish_open(struct inode *inode, struct file *file); +int fuse_finish_open(struct inode *inode, struct file *file); void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags); @@ -1364,11 +1414,83 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); -/* file.c */ +/* iomode.c */ +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff); +int fuse_inode_uncached_io_start(struct fuse_inode *fi, + struct fuse_backing *fb); +void fuse_inode_uncached_io_end(struct fuse_inode *fi); +int fuse_file_io_open(struct file *file, struct inode *inode); +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); + +/* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); +/* passthrough.c */ +static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return READ_ONCE(fi->fb); +#else + return NULL; +#endif +} + +static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, + struct fuse_backing *fb) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return xchg(&fi->fb, fb); +#else + return NULL; +#endif +} + +#ifdef CONFIG_FUSE_PASSTHROUGH +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); +void fuse_backing_put(struct fuse_backing *fb); +#else + +static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + return NULL; +} + +static inline void fuse_backing_put(struct fuse_backing *fb) +{ +} +#endif + +void fuse_backing_files_init(struct fuse_conn *fc); +void fuse_backing_files_free(struct fuse_conn *fc); +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); +int fuse_backing_close(struct fuse_conn *fc, int backing_id); + +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id); +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb); + +static inline struct file *fuse_file_passthrough(struct fuse_file *ff) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return ff->passthrough; +#else + return NULL; +#endif +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags); +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 735abf426a0640f2be5df6a2f275fc9bda9d8482..d771ce993e3e0eb7dceccc9830d5d44686d4ed17 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -111,6 +111,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) goto out_free_forget; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_inode_backing_set(fi, NULL); + return &fi->inode; out_free_forget: @@ -129,6 +132,9 @@ static void fuse_free_inode(struct inode *inode) #ifdef CONFIG_FUSE_DAX kfree(fi->dax); #endif + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_put(fuse_inode_backing(fi)); + kmem_cache_free(fuse_inode_cachep, fi); } @@ -169,6 +175,7 @@ static void fuse_evict_inode(struct inode *inode) } } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { + WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } @@ -946,6 +953,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_init(fc); + INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); fm->fc = fc; @@ -976,6 +986,8 @@ void fuse_conn_put(struct fuse_conn *fc) WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_free(fc); call_rcu(&fc->rcu, delayed_release); } } @@ -1312,6 +1324,29 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->create_supp_group = 1; if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) fc->direct_io_allow_mmap = 1; + /* + * max_stack_depth is the max stack depth of FUSE fs, + * so it has to be at least 1 to support passthrough + * to backing files. + * + * with max_stack_depth > 1, the backing files can be + * on a stacked fs (e.g. overlayfs) themselves and with + * max_stack_depth == 1, FUSE fs can be stacked as the + * underlying fs of a stacked fs (e.g. overlayfs). + * + * Also don't allow the combination of FUSE_PASSTHROUGH + * and FUSE_WRITEBACK_CACHE, current design doesn't handle + * them together. + */ + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && + (flags & FUSE_PASSTHROUGH) && + arg->max_stack_depth > 0 && + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH && + !(flags & FUSE_WRITEBACK_CACHE)) { + fc->passthrough = 1; + fc->max_stack_depth = arg->max_stack_depth; + fm->sb->s_stack_depth = arg->max_stack_depth; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1367,6 +1402,8 @@ void fuse_send_init(struct fuse_mount *fm) #endif if (fm->fc->auto_submounts) flags |= FUSE_SUBMOUNTS; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + flags |= FUSE_PASSTHROUGH; ia->in.flags = flags; ia->in.flags2 = flags >> 32; diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c new file mode 100644 index 0000000000000000000000000000000000000000..c99e285f3183ef92f7662ac84956ad1a03315ea9 --- /dev/null +++ b/fs/fuse/iomode.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE inode io modes. + * + * Copyright (c) 2024 CTERA Networks. + */ + +#include "fuse_i.h" + +#include +#include +#include +#include + +/* + * Return true if need to wait for new opens in caching mode. + */ +static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) +{ + return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi); +} + +/* + * Called on cached file open() and on first mmap() of direct_io file. + * Takes cached_io inode mode reference to be dropped on file release. + * + * Blocks new parallel dio writes and waits for the in-progress parallel dio + * writes to complete. + */ +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + /* There are no io modes if server does not implement open */ + if (!ff->args) + return 0; + + spin_lock(&fi->lock); + /* + * Setting the bit advises new direct-io writes to use an exclusive + * lock - without it the wait below might be forever. + */ + while (fuse_is_io_cache_wait(fi)) { + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); + spin_lock(&fi->lock); + } + + /* + * Check if inode entered passthrough io mode while waiting for parallel + * dio write completion. + */ + if (fuse_inode_backing(fi)) { + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + return -ETXTBSY; + } + + WARN_ON(ff->iomode == IOM_UNCACHED); + if (ff->iomode == IOM_NONE) { + ff->iomode = IOM_CACHED; + if (fi->iocachectr == 0) + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + fi->iocachectr++; + } + spin_unlock(&fi->lock); + return 0; +} + +static void fuse_file_cached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) +{ + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr <= 0); + WARN_ON(ff->iomode != IOM_CACHED); + ff->iomode = IOM_NONE; + fi->iocachectr--; + if (fi->iocachectr == 0) + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); +} + +/* Start strictly uncached io mode where cache access is not allowed */ +int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) +{ + struct fuse_backing *oldfb; + int err = 0; + + spin_lock(&fi->lock); + /* deny conflicting backing files on same fuse inode */ + oldfb = fuse_inode_backing(fi); + if (fb && oldfb && oldfb != fb) { + err = -EBUSY; + goto unlock; + } + if (fi->iocachectr > 0) { + err = -ETXTBSY; + goto unlock; + } + fi->iocachectr--; + + /* fuse inode holds a single refcount of backing file */ + if (fb && !oldfb) { + oldfb = fuse_inode_backing_set(fi, fb); + WARN_ON_ONCE(oldfb != NULL); + } else { + fuse_backing_put(fb); + } +unlock: + spin_unlock(&fi->lock); + return err; +} + +/* Takes uncached_io inode mode reference to be dropped on file release */ +static int fuse_file_uncached_io_open(struct inode *inode, + struct fuse_file *ff, + struct fuse_backing *fb) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + err = fuse_inode_uncached_io_start(fi, fb); + if (err) + return err; + + WARN_ON(ff->iomode != IOM_NONE); + ff->iomode = IOM_UNCACHED; + return 0; +} + +void fuse_inode_uncached_io_end(struct fuse_inode *fi) +{ + struct fuse_backing *oldfb = NULL; + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr >= 0); + fi->iocachectr++; + if (!fi->iocachectr) { + wake_up(&fi->direct_io_waitq); + oldfb = fuse_inode_backing_set(fi, NULL); + } + spin_unlock(&fi->lock); + if (oldfb) + fuse_backing_put(oldfb); +} + +/* Drop uncached_io reference from passthrough open */ +static void fuse_file_uncached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) +{ + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fuse_inode_uncached_io_end(fi); +} + +/* + * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. + * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write + * operations go directly to the server, but mmap is done on the backing file. + * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode + * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination. + */ +#define FOPEN_PASSTHROUGH_MASK \ + (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \ + FOPEN_NOFLUSH) + +static int fuse_file_passthrough_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_backing *fb; + int err; + + /* Check allowed conditions for file open in passthrough mode */ + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough || + (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) + return -EINVAL; + + fb = fuse_passthrough_open(file, inode, + ff->args->open_outarg.backing_id); + if (IS_ERR(fb)) + return PTR_ERR(fb); + + /* First passthrough file open denies caching inode io mode */ + err = fuse_file_uncached_io_open(inode, ff, fb); + if (!err) + return 0; + + fuse_passthrough_release(ff, fb); + fuse_backing_put(fb); + + return err; +} + +/* Request access to submit new io to inode via open file */ +int fuse_file_io_open(struct file *file, struct inode *inode) +{ + struct fuse_file *ff = file->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + /* + * io modes are not relevant with DAX and with server that does not + * implement open. + */ + if (FUSE_IS_DAX(inode) || !ff->args) + return 0; + + /* + * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode + * which is already open for passthrough. + */ + err = -EINVAL; + if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH)) + goto fail; + + /* + * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. + */ + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; + + /* + * First passthrough file open denies caching inode io mode. + * First caching file open enters caching inode io mode. + * + * Note that if user opens a file open with O_DIRECT, but server did + * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, + * so we put the inode in caching mode to prevent parallel dio. + */ + if ((ff->open_flags & FOPEN_DIRECT_IO) && + !(ff->open_flags & FOPEN_PASSTHROUGH)) + return 0; + + if (ff->open_flags & FOPEN_PASSTHROUGH) + err = fuse_file_passthrough_open(inode, file); + else + err = fuse_file_cached_io_open(inode, ff); + if (err) + goto fail; + + return 0; + +fail: + pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", + ff->open_flags, err); + /* + * The file open mode determines the inode io mode. + * Using incorrect open mode is a server mistake, which results in + * user visible failure of open() with EIO error. + */ + return -EIO; +} + +/* No more pending io and no new io possible to inode via open/mmapped file */ +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * Last passthrough file close allows caching inode io mode. + * Last caching file close exits caching inode io mode. + */ + switch (ff->iomode) { + case IOM_NONE: + /* Nothing to do */ + break; + case IOM_UNCACHED: + fuse_file_uncached_io_release(ff, fi); + break; + case IOM_CACHED: + fuse_file_cached_io_release(ff, fi); + break; + } +} diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c new file mode 100644 index 0000000000000000000000000000000000000000..d1b570d39501cce719928d8a3ed36214070a1c1b --- /dev/null +++ b/fs/fuse/passthrough.c @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE passthrough to backing file. + * + * Copyright (c) 2023 CTERA Networks. + */ + +#include "fuse_i.h" + +#include +#include +#include + +static void fuse_file_accessed(struct file *file) +{ + struct inode *inode = file_inode(file); + + fuse_invalidate_atime(inode); +} + +static void fuse_passthrough_end_write(struct file *file, loff_t pos, ssize_t ret) +{ + struct inode *inode = file_inode(file); + + fuse_write_update_attr(inode, pos, ret); +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + ret = backing_file_read_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + + return ret; +} + +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .end_write = fuse_passthrough_end_write, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + inode_lock(inode); + ret = backing_file_write_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + inode_unlock(inode); + + return ret; +} + +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = in, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + return backing_file_splice_read(backing_file, ppos, pipe, len, flags, + &ctx); +} + +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct inode *inode = file_inode(out); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = out, + .end_write = fuse_passthrough_end_write, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + inode_lock(inode); + ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags, + &ctx); + inode_unlock(inode); + + return ret; +} + +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, start=%lu, end=%lu\n", __func__, + backing_file, vma->vm_start, vma->vm_end); + + return backing_file_mmap(backing_file, vma, &ctx); +} + +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + if (fb && refcount_inc_not_zero(&fb->count)) + return fb; + return NULL; +} + +static void fuse_backing_free(struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p\n", __func__, fb); + + if (fb->file) + fput(fb->file); + put_cred(fb->cred); + kfree_rcu(fb, rcu); +} + +void fuse_backing_put(struct fuse_backing *fb) +{ + if (fb && refcount_dec_and_test(&fb->count)) + fuse_backing_free(fb); +} + +void fuse_backing_files_init(struct fuse_conn *fc) +{ + idr_init(&fc->backing_files_map); +} + +static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock(&fc->lock); + /* FIXME: xarray might be space inefficient */ + id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); + spin_unlock(&fc->lock); + idr_preload_end(); + + WARN_ON_ONCE(id == 0); + return id; +} + +static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, + int id) +{ + struct fuse_backing *fb; + + spin_lock(&fc->lock); + fb = idr_remove(&fc->backing_files_map, id); + spin_unlock(&fc->lock); + + return fb; +} + +static int fuse_backing_id_free(int id, void *p, void *data) +{ + struct fuse_backing *fb = p; + + WARN_ON_ONCE(refcount_read(&fb->count) != 1); + fuse_backing_free(fb); + return 0; +} + +void fuse_backing_files_free(struct fuse_conn *fc) +{ + idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); + idr_destroy(&fc->backing_files_map); +} + +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) +{ + struct file *file; + struct super_block *backing_sb; + struct fuse_backing *fb = NULL; + int res; + + pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + res = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + res = -EINVAL; + if (map->flags || map->padding) + goto out; + + file = fget(map->fd); + res = -EBADF; + if (!file) + goto out; + + res = -EOPNOTSUPP; + if (!file->f_op->read_iter || !file->f_op->write_iter) + goto out_fput; + + backing_sb = file_inode(file)->i_sb; + res = -ELOOP; + if (backing_sb->s_stack_depth >= fc->max_stack_depth) + goto out_fput; + + fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); + res = -ENOMEM; + if (!fb) + goto out_fput; + + fb->file = file; + fb->cred = prepare_creds(); + refcount_set(&fb->count, 1); + + res = fuse_backing_id_alloc(fc, fb); + if (res < 0) { + fuse_backing_free(fb); + fb = NULL; + } + +out: + pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); + + return res; + +out_fput: + fput(file); + goto out; +} + +int fuse_backing_close(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb = NULL; + int err; + + pr_debug("%s: backing_id=%d\n", __func__, backing_id); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + err = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + err = -ENOENT; + fb = fuse_backing_id_remove(fc, backing_id); + if (!fb) + goto out; + + fuse_backing_put(fb); + err = 0; +out: + pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); + + return err; +} + +/* + * Setup passthrough to a backing file. + * + * Returns an fb object with elevated refcount to be stored in fuse inode. + */ +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; + struct fuse_backing *fb = NULL; + struct file *backing_file; + int err; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + rcu_read_lock(); + fb = idr_find(&fc->backing_files_map, backing_id); + fb = fuse_backing_get(fb); + rcu_read_unlock(); + + err = -ENOENT; + if (!fb) + goto out; + + /* Allocate backing file per fuse file to store fuse path */ + backing_file = backing_file_open(&file->f_path, file->f_flags, + &fb->file->f_path, fb->cred); + err = PTR_ERR(backing_file); + if (IS_ERR(backing_file)) { + fuse_backing_put(fb); + goto out; + } + + err = 0; + ff->passthrough = backing_file; + ff->cred = get_cred(fb->cred); +out: + pr_debug("%s: backing_id=%d, fb=0x%p, backing_file=0x%p, err=%i\n", __func__, + backing_id, fb, ff->passthrough, err); + + return err ? ERR_PTR(err) : fb; +} + +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p, backing_file=0x%p\n", __func__, + fb, ff->passthrough); + + fput(ff->passthrough); + ff->passthrough = NULL; + put_cred(ff->cred); + ff->cred = NULL; +} diff --git a/fs/inode.c b/fs/inode.c index 46a31aba793382913d01f79f97e6d99e1cdb91df..ad7445342ee9f7dd26bae2cd68c5b9c56fef87d4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2052,7 +2052,7 @@ void touch_atime(const struct path *path) if (!sb_start_write_trylock(inode->i_sb)) return; - if (__mnt_want_write(mnt) != 0) + if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to @@ -2064,7 +2064,7 @@ void touch_atime(const struct path *path) * of the fs read only, e.g. subvolumes in Btrfs. */ inode_update_time(inode, S_ATIME); - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } @@ -2177,9 +2177,9 @@ static int __file_update_time(struct file *file, int sync_mode) struct inode *inode = file_inode(file); /* try to update time settings */ - if (!__mnt_want_write_file(file)) { + if (!mnt_get_write_access_file(file)) { ret = inode_update_time(inode, sync_mode); - __mnt_drop_write_file(file); + mnt_put_write_access_file(file); } return ret; diff --git a/fs/internal.h b/fs/internal.h index d64ae03998cce9dcffdad33baa3e52f4f7dc8983..273e6fd40d1be26482c4a191d136c2391666b813 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -extern int __mnt_want_write_file(struct file *); -extern void __mnt_drop_write_file(struct file *); +int mnt_get_write_access_file(struct file *file); +void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); @@ -95,13 +95,20 @@ struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); +static inline void file_put_write_access(struct file *file) +{ + put_write_access(file->f_inode); + mnt_put_write_access(file->f_path.mnt); + if (unlikely(file->f_mode & FMODE_BACKING)) + mnt_put_write_access(backing_file_user_path(file)->mnt); +} + static inline void put_file_access(struct file *file) { if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_dec(file->f_inode); } else if (file->f_mode & FMODE_WRITER) { - put_write_access(file->f_inode); - __mnt_drop_write(file->f_path.mnt); + file_put_write_access(file); } } @@ -130,9 +137,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb) * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in - * __mnt_want_write() before the mnt_is_readonly() check. The barrier - * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already - * cleared, it will see s_readonly_remount set. + * mnt_get_write_access() before the mnt_is_readonly() check. + * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD + * already cleared, it will see s_readonly_remount set. */ smp_wmb(); } diff --git a/fs/namespace.c b/fs/namespace.c index 190fbb026ff00b77d548853219658e563f78f08e..012c3c4461e7cbc30cbd848e6fb0086280fc7ae9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -320,16 +320,16 @@ static int mnt_is_readonly(struct vfsmount *mnt) * can determine when writes are able to occur to a filesystem. */ /** - * __mnt_want_write - get write access to a mount without freeze protection + * mnt_get_write_access - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being - * frozen. When the write operation is finished, __mnt_drop_write() must be + * frozen. When the write operation is finished, mnt_put_write_access() must be * called. This is effectively a refcount. */ -int __mnt_want_write(struct vfsmount *m) +int mnt_get_write_access(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; @@ -376,6 +376,7 @@ int __mnt_want_write(struct vfsmount *m) return ret; } +EXPORT_SYMBOL_GPL(mnt_get_write_access); /** * mnt_want_write - get write access to a mount @@ -391,7 +392,7 @@ int mnt_want_write(struct vfsmount *m) int ret; sb_start_write(m->mnt_sb); - ret = __mnt_want_write(m); + ret = mnt_get_write_access(m); if (ret) sb_end_write(m->mnt_sb); return ret; @@ -399,15 +400,15 @@ int mnt_want_write(struct vfsmount *m) EXPORT_SYMBOL_GPL(mnt_want_write); /** - * __mnt_want_write_file - get write access to a file's mount + * mnt_get_write_access_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like __mnt_want_write, but if the file is already open for writing it + * This is like mnt_get_write_access, but if @file is already open for write it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be - * paired with __mnt_drop_write_file. + * paired with mnt_put_write_access_file. */ -int __mnt_want_write_file(struct file *file) +int mnt_get_write_access_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* @@ -418,7 +419,7 @@ int __mnt_want_write_file(struct file *file) return -EROFS; return 0; } - return __mnt_want_write(file->f_path.mnt); + return mnt_get_write_access(file->f_path.mnt); } /** @@ -435,7 +436,7 @@ int mnt_want_write_file(struct file *file) int ret; sb_start_write(file_inode(file)->i_sb); - ret = __mnt_want_write_file(file); + ret = mnt_get_write_access_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; @@ -443,19 +444,20 @@ int mnt_want_write_file(struct file *file) EXPORT_SYMBOL_GPL(mnt_want_write_file); /** - * __mnt_drop_write - give up write access to a mount + * mnt_put_write_access - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with - * __mnt_want_write() call above. + * mnt_get_write_access() call above. */ -void __mnt_drop_write(struct vfsmount *mnt) +void mnt_put_write_access(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } +EXPORT_SYMBOL_GPL(mnt_put_write_access); /** * mnt_drop_write - give up write access to a mount @@ -467,20 +469,20 @@ void __mnt_drop_write(struct vfsmount *mnt) */ void mnt_drop_write(struct vfsmount *mnt) { - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); -void __mnt_drop_write_file(struct file *file) +void mnt_put_write_access_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) - __mnt_drop_write(file->f_path.mnt); + mnt_put_write_access(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { - __mnt_drop_write_file(file); + mnt_put_write_access_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b3e51d88faff46b21b24cd4cbeddf03c6e14f858..f65bd7b686d85a4057e1379f698d33aee15f6d8e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1025,7 +1025,10 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); - host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + host_err = rw_verify_area(READ, file, &offset, *count); + if (!host_err) + host_err = splice_direct_to_actor(file, &sd, + nfsd_direct_splice_actor); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1162,9 +1165,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); - file_start_write(file); host_err = vfs_iter_write(file, &iter, &pos, flags); - file_end_write(file); if (host_err < 0) { nfsd_reset_write_verifier(nn); trace_nfsd_writeverf_reset(nn, rqstp, host_err); diff --git a/fs/open.c b/fs/open.c index f9ac703ec1b2d368cb4f92905b4a5654cb53ad0c..4679db501d432d770909b91ca16cc1778af76c1d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -870,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } +static inline int file_get_write_access(struct file *f) +{ + int error; + + error = get_write_access(f->f_inode); + if (unlikely(error)) + return error; + error = mnt_get_write_access(f->f_path.mnt); + if (unlikely(error)) + goto cleanup_inode; + if (unlikely(f->f_mode & FMODE_BACKING)) { + error = mnt_get_write_access(backing_file_user_path(f)->mnt); + if (unlikely(error)) + goto cleanup_mnt; + } + return 0; + +cleanup_mnt: + mnt_put_write_access(f->f_path.mnt); +cleanup_inode: + put_write_access(f->f_inode); + return error; +} + static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *)) @@ -892,14 +916,9 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { - error = get_write_access(inode); + error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; - error = __mnt_want_write(f->f_path.mnt); - if (unlikely(error)) { - put_write_access(inode); - goto cleanup_file; - } f->f_mode |= FMODE_WRITER; } @@ -1158,45 +1177,6 @@ struct file *kernel_file_open(const struct path *path, int flags, } EXPORT_SYMBOL_GPL(kernel_file_open); -/** - * backing_file_open - open a backing file for kernel internal use - * @path: path of the file to open - * @flags: open flags - * @real_path: path of the backing file - * @cred: credentials for open - * - * Open a backing file for a stackable filesystem (e.g., overlayfs). - * @path may be on the stackable filesystem and backing inode on the - * underlying filesystem. In this case, we want to be able to return - * the @real_path of the backing inode. This is done by embedding the - * returned file into a container structure that also stores the path of - * the backing inode on the underlying filesystem, which can be - * retrieved using backing_file_real_path(). - */ -struct file *backing_file_open(const struct path *path, int flags, - const struct path *real_path, - const struct cred *cred) -{ - struct file *f; - int error; - - f = alloc_empty_backing_file(flags, cred); - if (IS_ERR(f)) - return f; - - f->f_path = *path; - path_get(real_path); - *backing_file_real_path(f) = *real_path; - error = do_dentry_open(f, d_inode(real_path->dentry), NULL); - if (error) { - fput(f); - f = ERR_PTR(error); - } - - return f; -} -EXPORT_SYMBOL_GPL(backing_file_open); - #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index fec5020c3495039e0eda0c865aaf10b989779588..2ac67e04a6fbede386e3a8e0008a1ad1de2d3f9b 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config OVERLAY_FS tristate "Overlay filesystem support" + select FS_STACK select EXPORTFS help An overlay filesystem combines two filesystems - an 'upper' filesystem diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index dbf7b3cd70ca5e0ebf860b426335ea4c34cb798a..c8578e4853f01a7b18c7597760d9cd14176de3d6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -230,6 +230,19 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old, return ovl_real_fileattr_set(new, &newfa); } +static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) +{ + loff_t tmp; + + if (pos != pos2) + return -EIO; + if (pos < 0 || len < 0 || totlen < 0) + return -EIO; + if (check_add_overflow(pos, len, &tmp)) + return -EIO; + return 0; +} + static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, struct file *new_file, loff_t len) { @@ -244,7 +257,8 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, int error = 0; ovl_path_lowerdata(dentry, &datapath); - if (WARN_ON(datapath.dentry == NULL)) + if (WARN_ON_ONCE(datapath.dentry == NULL) || + WARN_ON_ONCE(len < 0)) return -EIO; old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY); @@ -252,10 +266,16 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, return PTR_ERR(old_file); /* Try to use clone_file_range to clone up within the same fs */ - cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); + cloned = vfs_clone_file_range(old_file, 0, new_file, 0, len, 0); if (cloned == len) goto out_fput; + /* Couldn't clone, so now we try to copy the data */ + error = rw_verify_area(READ, old_file, &old_pos, len); + if (!error) + error = rw_verify_area(WRITE, new_file, &new_pos, len); + if (error) + goto out_fput; /* Check if lower fs supports seek operation */ if (old_file->f_mode & FMODE_LSEEK) @@ -287,8 +307,12 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, * it may not recognize all kind of holes and sometimes * only skips partial of hole area. However, it will be * enough for most of the use cases. + * + * We do not hold upper sb_writers throughout the loop to avert + * lockdep warning with llseek of lower file in nested overlay: + * - upper sb_writers + * -- lower ovl_inode_lock (ovl_llseek) */ - if (skip_hole && data_pos < old_pos) { data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA); if (data_pos > old_pos) { @@ -303,6 +327,10 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, } } + error = ovl_verify_area(old_pos, new_pos, this_len, len); + if (error) + break; + bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, this_len, SPLICE_F_MOVE); @@ -426,29 +454,28 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, return ERR_PTR(err); } -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, - struct dentry *upper) +struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin) { - const struct ovl_fh *fh = NULL; - int err; - /* * When lower layer doesn't support export operations store a 'null' fh, * so we can use the overlay.origin xattr to distignuish between a copy * up and a pure upper inode. */ - if (ovl_can_decode_fh(lower->d_sb)) { - fh = ovl_encode_real_fh(ofs, d_inode(lower), false); - if (IS_ERR(fh)) - return PTR_ERR(fh); - } + if (!ovl_can_decode_fh(origin->d_sb)) + return NULL; + + return ovl_encode_real_fh(ofs, d_inode(origin), false); +} +int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, + struct dentry *upper) +{ + int err; /* * Do not fail when upper doesn't support xattrs. */ err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf, fh ? fh->fb.len : 0, 0); - kfree(fh); /* Ignore -EPERM from setting "user.*" on symlink/special */ return err == -EPERM ? 0 : err; @@ -476,7 +503,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, * * Caller must hold i_mutex on indexdir. */ -static int ovl_create_index(struct dentry *dentry, struct dentry *origin, +static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, struct dentry *upper) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); @@ -502,7 +529,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry)))) return -EIO; - err = ovl_get_index_name(ofs, origin, &name); + err = ovl_get_index_name_fh(fh, &name); if (err) return err; @@ -541,6 +568,7 @@ struct ovl_copy_up_ctx { struct dentry *destdir; struct qstr destname; struct dentry *workdir; + const struct ovl_fh *origin_fh; bool origin; bool indexed; bool metacopy; @@ -555,14 +583,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(upperdir); + ovl_start_write(c->dentry); + /* Mark parent "impure" because it may now contain non-pure upper */ err = ovl_set_impure(c->parent, upperdir); if (err) - return err; + goto out; err = ovl_set_nlink_lower(c->dentry); if (err) - return err; + goto out; inode_lock_nested(udir, I_MUTEX_PARENT); upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir, @@ -581,10 +611,12 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) } inode_unlock(udir); if (err) - return err; + goto out; err = ovl_set_nlink_upper(c->dentry); +out: + ovl_end_write(c->dentry); return err; } @@ -637,7 +669,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) * hard link. */ if (c->origin) { - err = ovl_set_origin(ofs, c->lowerpath.dentry, temp); + err = ovl_set_origin_fh(ofs, c->origin_fh, temp); if (err) return err; } @@ -719,21 +751,19 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) .link = c->link }; - /* workdir and destdir could be the same when copying up to indexdir */ - err = -EIO; - if (lock_rename(c->workdir, c->destdir) != NULL) - goto unlock; - err = ovl_prep_cu_creds(c->dentry, &cc); if (err) - goto unlock; + return err; + ovl_start_write(c->dentry); + inode_lock(wdir); temp = ovl_create_temp(ofs, c->workdir, &cattr); + inode_unlock(wdir); + ovl_end_write(c->dentry); ovl_revert_cu_creds(&cc); - err = PTR_ERR(temp); if (IS_ERR(temp)) - goto unlock; + return PTR_ERR(temp); /* * Copy up data first and then xattrs. Writing data after @@ -741,15 +771,29 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) */ path.dentry = temp; err = ovl_copy_up_data(c, &path); - if (err) + /* + * We cannot hold lock_rename() throughout this helper, because of + * lock ordering with sb_writers, which shouldn't be held when calling + * ovl_copy_up_data(), so lock workdir and destdir and make sure that + * temp wasn't moved before copy up completion or cleanup. + */ + ovl_start_write(c->dentry); + if (lock_rename(c->workdir, c->destdir) != NULL || + temp->d_parent != c->workdir) { + /* temp or workdir moved underneath us? abort without cleanup */ + dput(temp); + err = -EIO; + goto unlock; + } else if (err) { goto cleanup; + } err = ovl_copy_up_metadata(c, temp); if (err) goto cleanup; if (S_ISDIR(c->stat.mode) && c->indexed) { - err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); + err = ovl_create_index(c->dentry, c->origin_fh, temp); if (err) goto cleanup; } @@ -779,6 +823,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) ovl_set_flag(OVL_WHITEOUTS, inode); unlock: unlock_rename(c->workdir, c->destdir); + ovl_end_write(c->dentry); return err; @@ -802,9 +847,10 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) return err; + ovl_start_write(c->dentry); tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); + ovl_end_write(c->dentry); ovl_revert_cu_creds(&cc); - if (IS_ERR(tmpfile)) return PTR_ERR(tmpfile); @@ -815,9 +861,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) goto out_fput; } + ovl_start_write(c->dentry); + err = ovl_copy_up_metadata(c, temp); if (err) - goto out_fput; + goto out; inode_lock_nested(udir, I_MUTEX_PARENT); @@ -831,7 +879,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) inode_unlock(udir); if (err) - goto out_fput; + goto out; if (c->metacopy_digest) ovl_set_flag(OVL_HAS_DIGEST, d_inode(c->dentry)); @@ -843,6 +891,8 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) ovl_set_upperdata(d_inode(c->dentry)); ovl_inode_update(d_inode(c->dentry), dget(temp)); +out: + ovl_end_write(c->dentry); out_fput: fput(tmpfile); return err; @@ -861,6 +911,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) { int err; struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct dentry *origin = c->lowerpath.dentry; + struct ovl_fh *fh = NULL; bool to_index = false; /* @@ -877,25 +929,35 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) to_index = true; } - if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) + if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) { + fh = ovl_get_origin_fh(ofs, origin); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + /* origin_fh may be NULL */ + c->origin_fh = fh; c->origin = true; + } if (to_index) { c->destdir = ovl_indexdir(c->dentry->d_sb); - err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname); + err = ovl_get_index_name(ofs, origin, &c->destname); if (err) - return err; + goto out_free_fh; } else if (WARN_ON(!c->parent)) { /* Disconnected dentry must be copied up to index dir */ - return -EIO; + err = -EIO; + goto out_free_fh; } else { /* * Mark parent "impure" because it may now contain non-pure * upper */ + ovl_start_write(c->dentry); err = ovl_set_impure(c->parent, c->destdir); + ovl_end_write(c->dentry); if (err) - return err; + goto out_free_fh; } /* Should we copyup with O_TMPFILE or with workdir? */ @@ -909,6 +971,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) if (c->indexed) ovl_set_flag(OVL_INDEX, d_inode(c->dentry)); + ovl_start_write(c->dentry); if (to_index) { /* Initialize nlink for copy up of disconnected dentry */ err = ovl_set_nlink_upper(c->dentry); @@ -923,10 +986,13 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry)); } + ovl_end_write(c->dentry); out: if (to_index) kfree(c->destname.name); +out_free_fh: + kfree(fh); return err; } @@ -1011,15 +1077,16 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) * Writing to upper file will clear security.capability xattr. We * don't want that to happen for normal copy-up operation. */ + ovl_start_write(c->dentry); if (capability) { err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS, capability, cap_size, 0); - if (err) - goto out_free; } - - - err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY); + if (!err) { + err = ovl_removexattr(ofs, upperpath.dentry, + OVL_XATTR_METACOPY); + } + ovl_end_write(c->dentry); if (err) goto out_free; @@ -1170,17 +1237,10 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) int ovl_maybe_copy_up(struct dentry *dentry, int flags) { - int err = 0; - - if (ovl_open_need_copy_up(dentry, flags)) { - err = ovl_want_write(dentry); - if (!err) { - err = ovl_copy_up_flags(dentry, flags); - ovl_drop_write(dentry); - } - } + if (!ovl_open_need_copy_up(dentry, flags)) + return 0; - return err; + return ovl_copy_up_flags(dentry, flags); } int ovl_copy_up_with_data(struct dentry *dentry) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 54602f0bed8be0f2a8ea5d047f89cb9267190074..68cfb2959f75ad23fb900713da51f0e09a5564d4 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -556,10 +556,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, struct cred *override_cred; struct dentry *parent = dentry->d_parent; - err = ovl_copy_up(parent); - if (err) - return err; - old_cred = ovl_override_creds(dentry->d_sb); /* @@ -623,6 +619,10 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, .link = link, }; + err = ovl_copy_up(dentry->d_parent); + if (err) + return err; + err = ovl_want_write(dentry); if (err) goto out; @@ -697,28 +697,24 @@ static int ovl_link(struct dentry *old, struct inode *newdir, int err; struct inode *inode; - err = ovl_want_write(old); + err = ovl_copy_up(old); if (err) goto out; - err = ovl_copy_up(old); + err = ovl_copy_up(new->d_parent); if (err) - goto out_drop_write; + goto out; - err = ovl_copy_up(new->d_parent); + err = ovl_nlink_start(old); if (err) - goto out_drop_write; + goto out; if (ovl_is_metacopy_dentry(old)) { err = ovl_set_link_redirect(old); if (err) - goto out_drop_write; + goto out_nlink_end; } - err = ovl_nlink_start(old); - if (err) - goto out_drop_write; - inode = d_inode(old); ihold(inode); @@ -728,9 +724,8 @@ static int ovl_link(struct dentry *old, struct inode *newdir, if (err) iput(inode); +out_nlink_end: ovl_nlink_end(old); -out_drop_write: - ovl_drop_write(old); out: return err; } @@ -888,17 +883,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) goto out; } - err = ovl_want_write(dentry); - if (err) - goto out; - err = ovl_copy_up(dentry->d_parent); if (err) - goto out_drop_write; + goto out; err = ovl_nlink_start(dentry); if (err) - goto out_drop_write; + goto out; old_cred = ovl_override_creds(dentry->d_sb); if (!lower_positive) @@ -923,8 +914,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (ovl_dentry_upper(dentry)) ovl_copyattr(d_inode(dentry)); -out_drop_write: - ovl_drop_write(dentry); out: ovl_cache_free(&list); return err; @@ -1128,29 +1117,32 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } } - err = ovl_want_write(old); - if (err) - goto out; - err = ovl_copy_up(old); if (err) - goto out_drop_write; + goto out; err = ovl_copy_up(new->d_parent); if (err) - goto out_drop_write; + goto out; if (!overwrite) { err = ovl_copy_up(new); if (err) - goto out_drop_write; + goto out; } else if (d_inode(new)) { err = ovl_nlink_start(new); if (err) - goto out_drop_write; + goto out; update_nlink = true; } + if (!update_nlink) { + /* ovl_nlink_start() took ovl_want_write() */ + err = ovl_want_write(old); + if (err) + goto out; + } + old_cred = ovl_override_creds(old->d_sb); if (!list_empty(&list)) { @@ -1283,8 +1275,8 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, revert_creds(old_cred); if (update_nlink) ovl_nlink_end(new); -out_drop_write: - ovl_drop_write(old); + else + ovl_drop_write(old); out: dput(opaquedir); ovl_cache_free(&list); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 3a17e4366f28c0e612b1a805915da50a22a7fd03..19df4efbf14896d02443e38355709041d3ae0cd0 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -23,12 +23,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry) if (ovl_dentry_upper(dentry)) return 0; - err = ovl_want_write(dentry); - if (!err) { - err = ovl_copy_up(dentry); - ovl_drop_write(dentry); - } - + err = ovl_copy_up(dentry); if (err) { pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n", dentry, err); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 8be4dc050d1ed2e5508f62e84be2d5a835990d36..5c5bf9822f27c6e04355810bff401ba3a2c20d49 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -9,20 +9,11 @@ #include #include #include -#include #include -#include #include +#include #include "overlayfs.h" -struct ovl_aio_req { - struct kiocb iocb; - refcount_t ref; - struct kiocb *orig_iocb; -}; - -static struct kmem_cache *ovl_aio_request_cachep; - static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -235,6 +226,17 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) return ret; } +static void ovl_file_modified(struct file *file) +{ + /* Update size/mtime */ + ovl_copyattr(file_inode(file)); +} + +static void ovl_file_end_write(struct file *file, loff_t pos, ssize_t ret) +{ + ovl_file_modified(file); +} + static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; @@ -260,62 +262,16 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -static rwf_t ovl_iocb_to_rwf(int ifl) -{ - rwf_t flags = 0; - - if (ifl & IOCB_NOWAIT) - flags |= RWF_NOWAIT; - if (ifl & IOCB_HIPRI) - flags |= RWF_HIPRI; - if (ifl & IOCB_DSYNC) - flags |= RWF_DSYNC; - if (ifl & IOCB_SYNC) - flags |= RWF_SYNC; - - return flags; -} - -static inline void ovl_aio_put(struct ovl_aio_req *aio_req) -{ - if (refcount_dec_and_test(&aio_req->ref)) { - fput(aio_req->iocb.ki_filp); - kmem_cache_free(ovl_aio_request_cachep, aio_req); - } -} - -static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) -{ - struct kiocb *iocb = &aio_req->iocb; - struct kiocb *orig_iocb = aio_req->orig_iocb; - - if (iocb->ki_flags & IOCB_WRITE) { - struct inode *inode = file_inode(orig_iocb->ki_filp); - - kiocb_end_write(iocb); - ovl_copyattr(inode); - } - - orig_iocb->ki_pos = iocb->ki_pos; - ovl_aio_put(aio_req); -} - -static void ovl_aio_rw_complete(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - ovl_aio_cleanup_handler(aio_req); - orig_iocb->ki_complete(orig_iocb, res); -} - static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct fd real; - const struct cred *old_cred; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; if (!iov_iter_count(iter)) return 0; @@ -324,36 +280,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb->ki_flags)); - } else { - struct ovl_aio_req *aio_req; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_complete = ovl_aio_rw_complete; - refcount_set(&aio_req->ref, 2); - ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); - ovl_file_accessed(file); -out_fdput: + ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags, + &ctx); fdput(real); return ret; @@ -364,9 +292,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct fd real; - const struct cred *old_cred; ssize_t ret; int ifl = iocb->ki_flags; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = file, + .end_write = ovl_file_end_write, + }; if (!iov_iter_count(iter)) return 0; @@ -374,19 +306,11 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(file); - if (ret) - goto out_unlock; ret = ovl_real_fdget(file, &real); if (ret) goto out_unlock; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); @@ -395,37 +319,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) * this property in case it is set by the issuer. */ ifl &= ~IOCB_DIO_CALLER_COMP; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(ifl)); - file_end_write(real.file); - /* Update size */ - ovl_copyattr(inode); - } else { - struct ovl_aio_req *aio_req; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_flags = ifl; - aio_req->iocb.ki_complete = ovl_aio_rw_complete; - refcount_set(&aio_req->ref, 2); - kiocb_start_write(&aio_req->iocb); - ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); -out_fdput: + ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx); fdput(real); out_unlock: @@ -438,20 +332,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - const struct cred *old_cred; struct fd real; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(in)->i_sb), + .user_file = in, + .accessed = ovl_file_accessed, + }; ret = ovl_real_fdget(in, &real); if (ret) return ret; - old_cred = ovl_override_creds(file_inode(in)->i_sb); - ret = vfs_splice_read(real.file, ppos, pipe, len, flags); - revert_creds(old_cred); - ovl_file_accessed(in); - + ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx); fdput(real); + return ret; } @@ -467,30 +362,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct fd real; - const struct cred *old_cred; struct inode *inode = file_inode(out); ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = out, + .end_write = ovl_file_end_write, + }; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(out); - if (ret) - goto out_unlock; ret = ovl_real_fdget(out, &real); if (ret) goto out_unlock; - old_cred = ovl_override_creds(inode->i_sb); - file_start_write(real.file); - - ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); - - file_end_write(real.file); - /* Update size */ - ovl_copyattr(inode); - revert_creds(old_cred); + ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx); fdput(real); out_unlock: @@ -528,23 +416,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) static int ovl_mmap(struct file *file, struct vm_area_struct *vma) { struct file *realfile = file->private_data; - const struct cred *old_cred; - int ret; - - if (!realfile->f_op->mmap) - return -ENODEV; - - if (WARN_ON(file != vma->vm_file)) - return -EIO; - - vma_set_file(vma, realfile); + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = call_mmap(vma->vm_file, vma); - revert_creds(old_cred); - ovl_file_accessed(file); - - return ret; + return backing_file_mmap(realfile, vma, &ctx); } static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) @@ -570,7 +448,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len revert_creds(old_cred); /* Update size */ - ovl_copyattr(inode); + ovl_file_modified(file); fdput(real); @@ -654,7 +532,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, revert_creds(old_cred); /* Update size */ - ovl_copyattr(inode_out); + ovl_file_modified(file_out); fdput(real_in); fdput(real_out); @@ -737,19 +615,3 @@ const struct file_operations ovl_file_operations = { .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; - -int __init ovl_aio_request_cache_init(void) -{ - ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", - sizeof(struct ovl_aio_req), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ovl_aio_request_cachep) - return -ENOMEM; - - return 0; -} - -void ovl_aio_request_cache_destroy(void) -{ - kmem_cache_destroy(ovl_aio_request_cachep); -} diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 9c42a30317d587a5018a448a747995b9c4def954..64a18913b1d6e5207d8fb04946417037ad4eb720 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -32,10 +32,6 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (err) return err; - err = ovl_want_write(dentry); - if (err) - goto out; - if (attr->ia_valid & ATTR_SIZE) { /* Truncate should trigger data copy up as well */ full_copy_up = true; @@ -54,7 +50,7 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, winode = d_inode(upperdentry); err = get_write_access(winode); if (err) - goto out_drop_write; + goto out; } if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) @@ -78,6 +74,10 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ attr->ia_valid &= ~ATTR_OPEN; + err = ovl_want_write(dentry); + if (err) + goto out_put_write; + inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); err = ovl_do_notify_change(ofs, upperdentry, attr); @@ -85,12 +85,12 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (!err) ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); + ovl_drop_write(dentry); +out_put_write: if (winode) put_write_access(winode); } -out_drop_write: - ovl_drop_write(dentry); out: return err; } @@ -361,27 +361,27 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, struct path realpath; const struct cred *old_cred; - err = ovl_want_write(dentry); - if (err) - goto out; - if (!value && !upperdentry) { ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); revert_creds(old_cred); if (err < 0) - goto out_drop_write; + goto out; } if (!upperdentry) { err = ovl_copy_up(dentry); if (err) - goto out_drop_write; + goto out; realdentry = ovl_dentry_upper(dentry); } + err = ovl_want_write(dentry); + if (err) + goto out; + old_cred = ovl_override_creds(dentry->d_sb); if (value) { err = ovl_do_setxattr(ofs, realdentry, name, value, size, @@ -391,12 +391,10 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, err = ovl_do_removexattr(ofs, realdentry, name); } revert_creds(old_cred); + ovl_drop_write(dentry); /* copy c/mtime */ ovl_copyattr(inode); - -out_drop_write: - ovl_drop_write(dentry); out: return err; } @@ -611,10 +609,6 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); - err = ovl_want_write(dentry); - if (err) - return err; - /* * If ACL is to be removed from a lower file, check if it exists in * the first place before copying it up. @@ -630,7 +624,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, revert_creds(old_cred); if (IS_ERR(real_acl)) { err = PTR_ERR(real_acl); - goto out_drop_write; + goto out; } posix_acl_release(real_acl); } @@ -638,23 +632,26 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, if (!upperdentry) { err = ovl_copy_up(dentry); if (err) - goto out_drop_write; + goto out; realdentry = ovl_dentry_upper(dentry); } + err = ovl_want_write(dentry); + if (err) + goto out; + old_cred = ovl_override_creds(dentry->d_sb); if (acl) err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); else err = ovl_do_remove_acl(ofs, realdentry, acl_name); revert_creds(old_cred); + ovl_drop_write(dentry); /* copy c/mtime */ ovl_copyattr(inode); - -out_drop_write: - ovl_drop_write(dentry); +out: return err; } @@ -782,14 +779,14 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, unsigned int flags; int err; - err = ovl_want_write(dentry); - if (err) - goto out; - err = ovl_copy_up(dentry); if (!err) { ovl_path_real(dentry, &upperpath); + err = ovl_want_write(dentry); + if (err) + goto out; + old_cred = ovl_override_creds(inode->i_sb); /* * Store immutable/append-only flags in xattr and clear them @@ -802,6 +799,7 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, if (!err) err = ovl_real_fileattr_set(&upperpath, fa); revert_creds(old_cred); + ovl_drop_write(dentry); /* * Merge real inode flags with inode flags read from @@ -816,7 +814,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, /* Update ctime */ ovl_copyattr(inode); } - ovl_drop_write(dentry); out: return err; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 273a39d3e95133e62aa20666e1d40503708bffbd..2d2ef671b36ba5c270bb9721457a5a187d85f4be 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -507,6 +507,19 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, return err; } +int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, const struct ovl_fh *fh, + bool is_upper, bool set) +{ + int err; + + err = ovl_verify_fh(ofs, dentry, ox, fh); + if (set && err == -ENODATA) + err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); + + return err; +} + /* * Verify that @real dentry matches the file handle stored in xattr @name. * @@ -515,9 +528,9 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, * * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. */ -int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, struct dentry *real, bool is_upper, - bool set) +int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, + bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; @@ -530,9 +543,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, goto fail; } - err = ovl_verify_fh(ofs, dentry, ox, fh); - if (set && err == -ENODATA) - err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); + err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set); if (err) goto fail; @@ -548,6 +559,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, goto out; } + /* Get upper dentry from index */ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected) @@ -684,7 +696,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) goto out; } -static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) +int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name) { char *n, *s; @@ -873,20 +885,27 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, struct dentry *lower, struct dentry *upper) { + const struct ovl_fh *fh; int err; if (ovl_check_origin_xattr(ofs, upper)) return 0; + fh = ovl_get_origin_fh(ofs, lower); + if (IS_ERR(fh)) + return PTR_ERR(fh); + err = ovl_want_write(dentry); if (err) - return err; + goto out; - err = ovl_set_origin(ofs, lower, upper); + err = ovl_set_origin_fh(ofs, fh, upper); if (!err) err = ovl_set_impure(dentry->d_parent, upper->d_parent); ovl_drop_write(dentry); +out: + kfree(fh); return err; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 981967e507b3e1c8f0698e814e3eb96877e5ce07..616d4f2e0fd82cdcb95590c4304d607dae135686 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -406,10 +406,20 @@ static inline int ovl_do_getattr(const struct path *path, struct kstat *stat, } /* util.c */ +int ovl_get_write_access(struct dentry *dentry); +void ovl_put_write_access(struct dentry *dentry); +void ovl_start_write(struct dentry *dentry); +void ovl_end_write(struct dentry *dentry); int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); + +static inline const struct cred *ovl_creds(struct super_block *sb) +{ + return OVL_FS(sb)->creator_cred; +} + int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); @@ -632,11 +642,15 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp); int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, struct dentry *real, bool is_upper, - bool set); + enum ovl_xattr ox, const struct ovl_fh *fh, + bool is_upper, bool set); +int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, + bool is_upper, bool set); struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); +int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name); int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name); struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); @@ -648,17 +662,24 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); bool ovl_lower_positive(struct dentry *dentry); +static inline int ovl_verify_origin_fh(struct ovl_fs *ofs, struct dentry *upper, + const struct ovl_fh *fh, bool set) +{ + return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, fh, false, set); +} + static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool set) { - return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin, - false, set); + return ovl_verify_origin_xattr(ofs, upper, OVL_XATTR_ORIGIN, origin, + false, set); } static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index, struct dentry *upper, bool set) { - return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, set); + return ovl_verify_origin_xattr(ofs, index, OVL_XATTR_UPPER, upper, + true, set); } /* readdir.c */ @@ -807,8 +828,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, /* file.c */ extern const struct file_operations ovl_file_operations; -int __init ovl_aio_request_cache_init(void); -void ovl_aio_request_cache_destroy(void); int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); @@ -823,8 +842,9 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentr int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper); -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, - struct dentry *upper); +struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin); +int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, + struct dentry *upper); /* export.c */ extern const struct export_operations ovl_export_operations; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 2c056d737c27c3492938f6b9a18b5e73282e1853..65ea8fc5b670880b221988feb0877125210e7099 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct dentry *real = NULL, *lower; int err; - /* It's an overlay file */ + /* + * vfs is only expected to call d_real() with NULL from d_real_inode() + * and with overlay inode from file_dentry() on an overlay file. + * + * TODO: remove @inode argument from d_real() API, remove code in this + * function that deals with non-NULL @inode and remove d_real() call + * from file_dentry(). + */ if (inode && d_inode(dentry) == inode) return dentry; + else if (inode) + goto bug; if (!d_is_reg(dentry)) { - if (!inode || inode == d_inode(dentry)) - return dentry; - goto bug; + /* d_real_inode() is only relevant for regular files */ + return dentry; } real = ovl_dentry_upper(dentry); @@ -879,15 +887,20 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, { struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *indexdir; + struct dentry *origin = ovl_lowerstack(oe)->dentry; + const struct ovl_fh *fh; int err; + fh = ovl_get_origin_fh(ofs, origin); + if (IS_ERR(fh)) + return PTR_ERR(fh); + err = mnt_want_write(mnt); if (err) - return err; + goto out_free_fh; /* Verify lower root is upper root origin */ - err = ovl_verify_origin(ofs, upperpath->dentry, - ovl_lowerstack(oe)->dentry, true); + err = ovl_verify_origin_fh(ofs, upperpath->dentry, fh, true); if (err) { pr_err("failed to verify upper root origin\n"); goto out; @@ -919,9 +932,10 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, * directory entries. */ if (ovl_check_origin_xattr(ofs, ofs->indexdir)) { - err = ovl_verify_set_fh(ofs, ofs->indexdir, - OVL_XATTR_ORIGIN, - upperpath->dentry, true, false); + err = ovl_verify_origin_xattr(ofs, ofs->indexdir, + OVL_XATTR_ORIGIN, + upperpath->dentry, true, + false); if (err) pr_err("failed to verify index dir 'origin' xattr\n"); } @@ -939,6 +953,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, out: mnt_drop_write(mnt); +out_free_fh: + kfree(fh); return err; } @@ -1540,14 +1556,10 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; - err = ovl_aio_request_cache_init(); - if (!err) { - err = register_filesystem(&ovl_fs_type); - if (!err) - return 0; + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; - ovl_aio_request_cache_destroy(); - } kmem_cache_destroy(ovl_inode_cachep); return err; @@ -1563,7 +1575,6 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - ovl_aio_request_cache_destroy(); } module_init(ovl_init); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 0bf3ffcd072f6a1783008b8aa5a519483d71c698..cecdd7f2ac38d77010eb96fe908dd2cc8592828b 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -17,12 +17,38 @@ #include #include "overlayfs.h" +/* Get write access to upper mnt - may fail if upper sb was remounted ro */ +int ovl_get_write_access(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + return mnt_get_write_access(ovl_upper_mnt(ofs)); +} + +/* Get write access to upper sb - may block if upper sb is frozen */ +void ovl_start_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + sb_start_write(ovl_upper_mnt(ofs)->mnt_sb); +} + int ovl_want_write(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); return mnt_want_write(ovl_upper_mnt(ofs)); } +void ovl_put_write_access(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + mnt_put_write_access(ovl_upper_mnt(ofs)); +} + +void ovl_end_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + sb_end_write(ovl_upper_mnt(ofs)->mnt_sb); +} + void ovl_drop_write(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); @@ -647,22 +673,36 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags) return false; } +/* + * The copy up "transaction" keeps an elevated mnt write count on upper mnt, + * but leaves taking freeze protection on upper sb to lower level helpers. + */ int ovl_copy_up_start(struct dentry *dentry, int flags) { struct inode *inode = d_inode(dentry); int err; err = ovl_inode_lock_interruptible(inode); - if (!err && ovl_already_copied_up_locked(dentry, flags)) { + if (err) + return err; + + if (ovl_already_copied_up_locked(dentry, flags)) err = 1; /* Already copied up */ - ovl_inode_unlock(inode); - } + else + err = ovl_get_write_access(dentry); + if (err) + goto out_unlock; + + return 0; +out_unlock: + ovl_inode_unlock(inode); return err; } void ovl_copy_up_end(struct dentry *dentry) { + ovl_put_write_access(dentry); ovl_inode_unlock(d_inode(dentry)); } @@ -976,12 +1016,18 @@ static void ovl_cleanup_index(struct dentry *dentry) struct dentry *index = NULL; struct inode *inode; struct qstr name = { }; + bool got_write = false; int err; err = ovl_get_index_name(ofs, lowerdentry, &name); if (err) goto fail; + err = ovl_want_write(dentry); + if (err) + goto fail; + + got_write = true; inode = d_inode(upperdentry); if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", @@ -1019,6 +1065,8 @@ static void ovl_cleanup_index(struct dentry *dentry) goto fail; out: + if (got_write) + ovl_drop_write(dentry); kfree(name.name); dput(index); return; @@ -1065,8 +1113,12 @@ int ovl_nlink_start(struct dentry *dentry) if (err) return err; + err = ovl_want_write(dentry); + if (err) + goto out_unlock; + if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode)) - goto out; + return 0; old_cred = ovl_override_creds(dentry->d_sb); /* @@ -1077,10 +1129,15 @@ int ovl_nlink_start(struct dentry *dentry) */ err = ovl_set_nlink_upper(dentry); revert_creds(old_cred); - -out: if (err) - ovl_inode_unlock(inode); + goto out_drop_write; + + return 0; + +out_drop_write: + ovl_drop_write(dentry); +out_unlock: + ovl_inode_unlock(inode); return err; } @@ -1089,6 +1146,8 @@ void ovl_nlink_end(struct dentry *dentry) { struct inode *inode = d_inode(dentry); + ovl_drop_write(dentry); + if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) { const struct cred *old_cred; @@ -1406,6 +1465,7 @@ void ovl_copyattr(struct inode *inode) realinode = ovl_i_path_real(inode, &realpath); real_idmap = mnt_idmap(realpath.mnt); + spin_lock(&inode->i_lock); vfsuid = i_uid_into_vfsuid(real_idmap, realinode); vfsgid = i_gid_into_vfsgid(real_idmap, realinode); @@ -1416,4 +1476,5 @@ void ovl_copyattr(struct inode *inode) inode->i_mtime = realinode->i_mtime; inode_set_ctime_to_ts(inode, inode_get_ctime(realinode)); i_size_write(inode, i_size_read(realinode)); + spin_unlock(&inode->i_lock); } diff --git a/fs/proc/base.c b/fs/proc/base.c index ac79ba23c0cbd8f63460614ab9b7bd9b67714f5d..250e2c66348f9e29aa4f11f7a5332bf7a3050808 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2281,7 +2281,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma->vm_file->f_path; + *path = *file_user_path(vma->vm_file); path_get(path); rc = 0; } diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 4d3493579458f04e6ede75b24cd605ca2905fabe..c6e7ebc637562d80271ec84181dcbfa812e94518 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } seq_putc(m, '\n'); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ab12eb479c1f1248f1eb78643901d1dcbb536018..46b4c39a12dbcee7d15eb72a05294b3051502151 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -297,7 +297,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) if (anon_name) seq_printf(m, "[anon_shmem:%s]", anon_name->name); else - seq_file_path(m, file, "\n"); + seq_path(m, file_user_path(file), "\n"); goto done; } @@ -1976,7 +1976,7 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); - seq_file_path(m, file, "\n\t= "); + seq_path(m, file_user_path(file), "\n\t= "); } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); } else if (vma_is_initial_stack(vma)) { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 7cebd397cc26e1056d3533643c98f87a2ae5fd79..bce6745330003174ed2161e31978377477cfbd70 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); seq_puts(m, "[stack]"); diff --git a/fs/read_write.c b/fs/read_write.c index dd5c90675f51952e89e3a863054a5f1e8e51d04f..27a7729b82f36bf90c88a55b1aa96d984ff33fc5 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -775,12 +775,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return ret; } -static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, - loff_t *pos, rwf_t flags) +ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; + if (!file->f_op->read_iter) + return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) @@ -789,22 +791,20 @@ static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, tot_len = iov_iter_count(iter); if (!tot_len) goto out; - ret = rw_verify_area(READ, file, pos, tot_len); + ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; - if (file->f_op->read_iter) - ret = do_iter_readv_writev(file, iter, pos, READ, flags); - else - ret = do_loop_readv_writev(file, iter, pos, READ, flags); + ret = call_read_iter(file, iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } +EXPORT_SYMBOL(vfs_iocb_iter_read); -ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, - struct iov_iter *iter) +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, + rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -819,33 +819,30 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, tot_len = iov_iter_count(iter); if (!tot_len) goto out; - ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); + ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; - ret = call_read_iter(file, iocb, iter); + ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } -EXPORT_SYMBOL(vfs_iocb_iter_read); - -ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, - rwf_t flags) -{ - if (!file->f_op->read_iter) - return -EINVAL; - return do_iter_read(file, iter, ppos, flags); -} EXPORT_SYMBOL(vfs_iter_read); -static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, - loff_t *pos, rwf_t flags) +/* + * Caller is responsible for calling kiocb_end_write() on completion + * if async iocb was queued. + */ +ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; + if (!file->f_op->write_iter) + return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) @@ -854,88 +851,127 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, tot_len = iov_iter_count(iter); if (!tot_len) return 0; - ret = rw_verify_area(WRITE, file, pos, tot_len); + ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; - if (file->f_op->write_iter) - ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); - else - ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); + kiocb_start_write(iocb); + ret = call_write_iter(file, iocb, iter); + if (ret != -EIOCBQUEUED) + kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); + return ret; } +EXPORT_SYMBOL(vfs_iocb_iter_write); -ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, - struct iov_iter *iter) +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, + rwf_t flags) { size_t tot_len; - ssize_t ret = 0; + ssize_t ret; - if (!file->f_op->write_iter) - return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; + if (!file->f_op->write_iter) + return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; - ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); + + ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; - ret = call_write_iter(file, iocb, iter); + file_start_write(file); + ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); + file_end_write(file); return ret; } -EXPORT_SYMBOL(vfs_iocb_iter_write); - -ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, - rwf_t flags) -{ - if (!file->f_op->write_iter) - return -EINVAL; - return do_iter_write(file, iter, ppos, flags); -} EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t ret; + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); - if (ret >= 0) { - ret = do_iter_read(file, &iter, pos, flags); - kfree(iov); - } + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, + &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + + ret = rw_verify_area(READ, file, pos, tot_len); + if (ret < 0) + goto out; + if (file->f_op->read_iter) + ret = do_iter_readv_writev(file, &iter, pos, READ, flags); + else + ret = do_loop_readv_writev(file, &iter, pos, READ, flags); +out: + if (ret >= 0) + fsnotify_access(file); + kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t ret; + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); - if (ret >= 0) { - file_start_write(file); - ret = do_iter_write(file, &iter, pos, flags); - file_end_write(file); - kfree(iov); - } + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, + &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + + ret = rw_verify_area(WRITE, file, pos, tot_len); + if (ret < 0) + goto out; + + file_start_write(file); + if (file->f_op->write_iter) + ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); + else + ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); + if (ret > 0) + fsnotify_modify(file); + file_end_write(file); +out: + kfree(iov); return ret; } @@ -1252,10 +1288,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; - file_start_write(out.file); retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); - file_end_write(out.file); } else { if (out.file->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; @@ -1389,10 +1423,12 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { - lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); + /* May only be called from within ->copy_file_range() methods */ + if (WARN_ON_ONCE(flags)) + return -EINVAL; - return do_splice_direct(file_in, &pos_in, file_out, &pos_out, - len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + return splice_file_range(file_in, &pos_in, file_out, &pos_out, + min_t(size_t, len, MAX_RW_COUNT)); } EXPORT_SYMBOL(generic_copy_file_range); @@ -1480,6 +1516,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; + bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; @@ -1511,19 +1548,24 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); - goto done; - } - - if (!splice && file_in->f_op->remap_file_range && - file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { + } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); - if (ret > 0) - goto done; + /* fallback to splice */ + if (ret <= 0) + splice = true; + } else if (samesb) { + /* Fallback to splice for same sb copy for backward compat */ + splice = true; } + file_end_write(file_out); + + if (!splice) + goto done; + /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in @@ -1535,11 +1577,16 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * - * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE. + * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE + * for server-side-copy between any two sb. + * + * In any case, we call do_splice_direct() and not splice_file_range(), + * without file_start_write() held, to avoid possible deadlocks related + * to splicing from input file, while file_start_write() is held on + * the output file on a different sb. */ - ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); - + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); @@ -1551,8 +1598,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, inc_syscr(current); inc_syscw(current); - file_end_write(file_out); - return ret; } EXPORT_SYMBOL(vfs_copy_file_range); diff --git a/fs/remap_range.c b/fs/remap_range.c index 87ae4f0dc3aa01c6099ef2fa7a66b5d84bdb9703..cd4dbc11125f168d43cff4e527f99981bf3e8724 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -367,9 +367,9 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(generic_remap_file_range_prep); -loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { loff_t ret; @@ -393,8 +393,10 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, if (ret) return ret; + file_start_write(file_out); ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, remap_flags); + file_end_write(file_out); if (ret < 0) return ret; @@ -402,25 +404,10 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, fsnotify_modify(file_out); return ret; } -EXPORT_SYMBOL(do_clone_file_range); - -loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - - file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, - remap_flags); - file_end_write(file_out); - - return ret; -} EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ -static bool allow_file_dedupe(struct file *file) +static bool may_dedupe_file(struct file *file) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); @@ -445,24 +432,29 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)); - ret = mnt_want_write_file(dst_file); - if (ret) - return ret; - /* * This is redundant if called from vfs_dedupe_file_range(), but other * callers need it and it's not performance sesitive... */ ret = remap_verify_area(src_file, src_pos, len, false); if (ret) - goto out_drop_write; + return ret; ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret) - goto out_drop_write; + return ret; + + /* + * This needs to be called after remap_verify_area() because of + * sb_start_write() and before may_dedupe_file() because the mount's + * MAY_WRITE need to be checked with mnt_get_write_access_file() held. + */ + ret = mnt_want_write_file(dst_file); + if (ret) + return ret; ret = -EPERM; - if (!allow_file_dedupe(dst_file)) + if (!may_dedupe_file(dst_file)) goto out_drop_write; ret = -EXDEV; diff --git a/fs/splice.c b/fs/splice.c index d983d375ff1130b5b054241612f56d4510d65589..7cda013e5a1ef1f5781f6e1d6774c8865a30ab0d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -673,10 +673,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; int nbufs = pipe->max_usage; - struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), - GFP_KERNEL); + struct bio_vec *array; ssize_t ret; + if (!out->f_op->write_iter) + return -EINVAL; + + array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (unlikely(!array)) return -ENOMEM; @@ -684,6 +687,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { + struct kiocb kiocb; struct iov_iter from; unsigned int head, tail, mask; size_t left; @@ -733,7 +737,10 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); - ret = vfs_iter_write(out, &from, &sd.pos, 0); + init_sync_kiocb(&kiocb, out); + kiocb.ki_pos = sd.pos; + ret = call_write_iter(out, &kiocb, &from); + sd.pos = kiocb.ki_pos; if (ret <= 0) break; @@ -944,27 +951,15 @@ static void do_splice_eof(struct splice_desc *sd) sd->splice_eof(sd); } -/** - * vfs_splice_read - Read data from a file and splice it into a pipe - * @in: File to splice from - * @ppos: Input file offset - * @pipe: Pipe to splice to - * @len: Number of bytes to splice - * @flags: Splice modifier flags (SPLICE_F_*) - * - * Splice the requested amount of data from the input file to the pipe. This - * is synchronous as the caller must hold the pipe lock across the entire - * operation. - * - * If successful, it returns the amount of data spliced, 0 if it hit the EOF or - * a hole and a negative error code otherwise. +/* + * Callers already called rw_verify_area() on the entire range. + * No need to call it for sub ranges. */ -long vfs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static long do_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { unsigned int p_space; - int ret; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; @@ -975,10 +970,6 @@ long vfs_splice_read(struct file *in, loff_t *ppos, p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); len = min_t(size_t, len, p_space << PAGE_SHIFT); - ret = rw_verify_area(READ, in, ppos, len); - if (unlikely(ret < 0)) - return ret; - if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; @@ -992,6 +983,34 @@ long vfs_splice_read(struct file *in, loff_t *ppos, return copy_splice_read(in, ppos, pipe, len, flags); return in->f_op->splice_read(in, ppos, pipe, len, flags); } + +/** + * vfs_splice_read - Read data from a file and splice it into a pipe + * @in: File to splice from + * @ppos: Input file offset + * @pipe: Pipe to splice to + * @len: Number of bytes to splice + * @flags: Splice modifier flags (SPLICE_F_*) + * + * Splice the requested amount of data from the input file to the pipe. This + * is synchronous as the caller must hold the pipe lock across the entire + * operation. + * + * If successful, it returns the amount of data spliced, 0 if it hit the EOF or + * a hole and a negative error code otherwise. + */ +long vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + int ret; + + ret = rw_verify_area(READ, in, ppos, len); + if (unlikely(ret < 0)) + return ret; + + return do_splice_read(in, ppos, pipe, len, flags); +} EXPORT_SYMBOL_GPL(vfs_splice_read); /** @@ -1066,7 +1085,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, size_t read_len; loff_t pos = sd->pos, prev_pos = pos; - ret = vfs_splice_read(in, &pos, pipe, len, flags); + ret = do_splice_read(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto read_failure; @@ -1138,9 +1157,20 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; + long ret; + + file_start_write(file); + ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); + file_end_write(file); + return ret; +} + +static int splice_file_range_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; - return do_splice_from(pipe, file, sd->opos, sd->total_len, - sd->flags); + return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } static void direct_file_splice_eof(struct splice_desc *sd) @@ -1151,24 +1181,10 @@ static void direct_file_splice_eof(struct splice_desc *sd) file->f_op->splice_eof(file); } -/** - * do_splice_direct - splices data directly between two files - * @in: file to splice from - * @ppos: input file offset - * @out: file to splice to - * @opos: output file offset - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * For use by do_sendfile(). splice can easily emulate sendfile, but - * doing it in the application would incur an extra system call - * (splice in + splice out, as compared to just sendfile()). So this helper - * can splice directly through a process-private pipe. - * - */ -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags) +static long do_splice_direct_actor(struct file *in, loff_t *ppos, + struct file *out, loff_t *opos, + size_t len, unsigned int flags, + splice_direct_actor *actor) { struct splice_desc sd = { .len = len, @@ -1187,18 +1203,62 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; - ret = rw_verify_area(WRITE, out, opos, len); - if (unlikely(ret < 0)) - return ret; - - ret = splice_direct_to_actor(in, &sd, direct_splice_actor); + ret = splice_direct_to_actor(in, &sd, actor); if (ret > 0) *ppos = sd.pos; return ret; } +/** + * do_splice_direct - splices data directly between two files + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @opos: output file offset + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * For use by do_sendfile(). splice can easily emulate sendfile, but + * doing it in the application would incur an extra system call + * (splice in + splice out, as compared to just sendfile()). So this helper + * can splice directly through a process-private pipe. + * + * Callers already called rw_verify_area() on the entire range. + */ +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags) +{ + return do_splice_direct_actor(in, ppos, out, opos, len, flags, + direct_splice_actor); +} EXPORT_SYMBOL(do_splice_direct); +/** + * splice_file_range - splices data between two files for copy_file_range() + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @opos: output file offset + * @len: number of bytes to splice + * + * Description: + * For use by generic_copy_file_range() and ->copy_file_range() methods. + * Like do_splice_direct(), but vfs_copy_file_range() already holds + * start_file_write() on @out file. + * + * Callers already called rw_verify_area() on the entire range. + */ +long splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len) +{ + lockdep_assert(file_write_started(out)); + + return do_splice_direct_actor(in, ppos, out, opos, len, 0, + splice_file_range_actor); +} +EXPORT_SYMBOL(splice_file_range); + static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { for (;;) { @@ -1230,7 +1290,7 @@ long splice_file_to_pipe(struct file *in, pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) - ret = vfs_splice_read(in, offset, opipe, len, flags); + ret = do_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); @@ -1307,6 +1367,10 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, offset = in->f_pos; } + ret = rw_verify_area(READ, in, &offset, len); + if (unlikely(ret < 0)) + return ret; + if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; diff --git a/fs/super.c b/fs/super.c index 42c9754e3add4f888e83067281a350d4d1843744..14862ca3500af8bee835d378a20ea24c58391b20 100644 --- a/fs/super.c +++ b/fs/super.c @@ -2160,3 +2160,4 @@ int sb_init_dio_done_wq(struct super_block *sb) destroy_workqueue(wq); return 0; } +EXPORT_SYMBOL_GPL(sb_init_dio_done_wq); diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h new file mode 100644 index 0000000000000000000000000000000000000000..98e0b6c30193377eb18d1b9cc95edc9de7d0b1c9 --- /dev/null +++ b/include/linux/backing-file.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common helpers for stackable filesystems and backing files. + * + * Copyright (C) 2023 CTERA Networks. + */ + +#ifndef _LINUX_BACKING_FILE_H +#define _LINUX_BACKING_FILE_H + +#include +#include +#include + +struct backing_file_ctx { + const struct cred *cred; + struct file *user_file; + void (*accessed)(struct file *); + void (*end_write)(struct file *, loff_t, ssize_t); +}; + +struct file *backing_file_open(const struct path *user_path, int flags, + const struct path *real_path, + const struct cred *cred); +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx); + +#endif /* _LINUX_BACKING_FILE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 5e7aeb5ff463d817fa1423ce1800f40dd767ec23..bd3e4902f2f43123b48933ee0ead31b8ddff860c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1689,9 +1689,70 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) +/** + * __sb_write_started - check if sb freeze level is held + * @sb: the super we write to + * @level: the freeze level + * + * * > 0 - sb freeze level is held + * * 0 - sb freeze level is not held + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + */ +static inline int __sb_write_started(const struct super_block *sb, int level) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); +} + +/** + * sb_write_started - check if SB_FREEZE_WRITE is held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ static inline bool sb_write_started(const struct super_block *sb) { - return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1); + return __sb_write_started(sb, SB_FREEZE_WRITE); +} + +/** + * sb_write_not_started - check if SB_FREEZE_WRITE is not held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_not_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; +} + +/** + * file_write_started - check if SB_FREEZE_WRITE is held + * @file: the file we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + * May be false positive with !S_ISREG, because file_start_write() has + * no effect on !S_ISREG. + */ +static inline bool file_write_started(const struct file *file) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return true; + return sb_write_started(file_inode(file)->i_sb); +} + +/** + * file_write_not_started - check if SB_FREEZE_WRITE is not held + * @file: the file we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + * May be false positive with !S_ISREG, because file_start_write() has + * no effect on !S_ISREG. + */ +static inline bool file_write_not_started(const struct file *file) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return true; + return sb_write_not_started(file_inode(file)->i_sb); } /** @@ -2103,9 +2164,6 @@ int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); -extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); @@ -2593,24 +2651,21 @@ struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); -struct file *backing_file_open(const struct path *path, int flags, - const struct path *real_path, - const struct cred *cred); -struct path *backing_file_real_path(struct file *f); +struct path *backing_file_user_path(struct file *f); /* - * file_real_path - get the path corresponding to f_inode + * file_user_path - get the path to display for memory mapped file * - * When opening a backing file for a stackable filesystem (e.g., - * overlayfs) f_path may be on the stackable filesystem and f_inode on - * the underlying filesystem. When the path associated with f_inode is - * needed, this helper should be used instead of accessing f_path - * directly. -*/ -static inline const struct path *file_real_path(struct file *f) + * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file + * stored in ->vm_file is a backing file whose f_inode is on the underlying + * filesystem. When the mapped file path is displayed to user (e.g. via + * /proc//maps), this helper should be used to get the path to display + * to the user, which is the path of the fd that user has requested to map. + */ +static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) - return backing_file_real_path(f); + return backing_file_user_path(f); return &f->f_path; } @@ -3078,8 +3133,6 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); extern void diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 0dea8d0fdb0b8cb41e95858e13b9f28e1754d5b8..912b58bb66c548a6e3d7393c52d88571a773f31b 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -102,8 +102,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask) if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH)) return 0; - /* Overlayfs internal files have fake f_path */ - path = file_real_path(file); + path = &file->f_path; return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } diff --git a/include/linux/mount.h b/include/linux/mount.h index 4f40b40306d0f9679ca3be2615a03656196a501b..ac3dd287619784dfbeb9d3859d6d2287a7d9d423 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); -extern int __mnt_want_write(struct vfsmount *); -extern void __mnt_drop_write(struct vfsmount *); +int mnt_get_write_access(struct vfsmount *mnt); +void mnt_put_write_access(struct vfsmount *mnt); extern struct vfsmount *fc_mount(struct fs_context *fc); extern struct vfsmount *vfs_create_mount(struct fs_context *fc); diff --git a/include/linux/splice.h b/include/linux/splice.h index 6c461573434ddecd8c8b8889c02dde3abeacb6de..49532d5dda52362554b279412ec1afb6f14c59d8 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -80,11 +80,14 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, long vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, - splice_direct_actor *); -extern long do_splice(struct file *in, loff_t *off_in, - struct file *out, loff_t *off_out, - size_t len, unsigned int flags); +ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd, + splice_direct_actor *actor); +long do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags); +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); +long splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len); extern long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index e7418d15fe3906507827025545dcd5348202ff30..1162a47b6a42d709ac6f3c9fac01783be29d9eb5 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -211,6 +211,10 @@ * 7.39 * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures + * + * 7.40 + * - add max_stack_depth to fuse_init_out, add FUSE_PASSTHROUGH init flag + * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag */ #ifndef _LINUX_FUSE_H @@ -246,7 +250,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 39 +#define FUSE_KERNEL_MINOR_VERSION 40 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -353,6 +357,7 @@ struct fuse_file_lock { * FOPEN_STREAM: the file is stream-like (no file position at all) * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode + * FOPEN_PASSTHROUGH: passthrough read/write io for this open file */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) @@ -361,6 +366,7 @@ struct fuse_file_lock { #define FOPEN_STREAM (1 << 4) #define FOPEN_NOFLUSH (1 << 5) #define FOPEN_PARALLEL_DIRECT_WRITES (1 << 6) +#define FOPEN_PASSTHROUGH (1 << 7) /** * INIT request/reply flags @@ -449,6 +455,7 @@ struct fuse_file_lock { #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) #define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) +#define FUSE_PASSTHROUGH (1ULL << 37) /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP @@ -761,7 +768,7 @@ struct fuse_create_in { struct fuse_open_out { uint64_t fh; uint32_t open_flags; - uint32_t padding; + int32_t backing_id; }; struct fuse_release_in { @@ -877,7 +884,8 @@ struct fuse_init_out { uint16_t max_pages; uint16_t map_alignment; uint32_t flags2; - uint32_t unused[7]; + uint32_t max_stack_depth; + uint32_t unused[6]; }; #define CUSE_INIT_INFO_MAX 4096 @@ -1049,9 +1057,18 @@ struct fuse_notify_retrieve_in { uint64_t dummy4; }; +struct fuse_backing_map { + int32_t fd; + uint32_t flags; + uint64_t padding; +}; + /* Device ioctls: */ #define FUSE_DEV_IOC_MAGIC 229 #define FUSE_DEV_IOC_CLONE _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t) +#define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ + struct fuse_backing_map) +#define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) struct fuse_lseek_in { uint64_t fh; diff --git a/kernel/acct.c b/kernel/acct.c index a58a284f1f38dea19d58e0c915a85989a68d1ba4..84e151e8a65effbc599d5ff196d4cc7d8474d967 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -269,7 +269,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = __mnt_want_write(internal); + err = mnt_get_write_access(internal); if (err) { mntput(internal); kfree(acct); @@ -294,7 +294,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); mntput(mnt); return 0; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 2b948d35fb59ea8b22c935dbca6d1fd2b27a10e9..70ba8571425ce3fd8c04073c85b69af500c6bb77 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -408,7 +408,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, vmstart = vma->vm_start; } if (file) { - ret = trace_seq_path(s, &file->f_path); + ret = trace_seq_path(s, file_user_path(file)); if (ret) trace_seq_printf(s, "[+0x%lx]", ip - vmstart);