diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst
index a321b84eccaac0f66221836250073fb20f083fbd..297228f9f29902ff067e66a6e08b41db5e125a65 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -332,3 +332,13 @@ Each "watch" costs roughly 90 bytes on a 32-bit kernel, and roughly 160
 bytes on a 64-bit one.
 The current default value for ``max_user_watches`` is 4% of the
 available low memory, divided by the "watch" cost in bytes.
+
+5. /proc/sys/fs/fuse - Configuration options for FUSE filesystems
+=====================================================================
+
+This directory contains the following configuration options for FUSE
+filesystems:
+
+``/proc/sys/fs/fuse/max_pages_limit`` is a read/write file for
+setting/getting the maximum number of pages that can be used for servicing
+requests in FUSE.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 6e0228c6d0cba9541c8668efb86b83094751d469..dabc852a7ff1d331023b7efa6f28dcf3a67b43c7 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -11,5 +11,6 @@ fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
 fuse-y += iomode.o
 fuse-$(CONFIG_FUSE_DAX) += dax.o
 fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
+fuse-$(CONFIG_SYSCTL) += sysctl.o
 
 virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index ab62e46242568a5e82c55ec546333d8f1d90d161..1bf928e277fecf55c81c7e0c992cca61b934014b 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -174,11 +174,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
 	if (!fc)
 		goto out;
 
-	down_read(&fc->killsb);
-	spin_lock(&fc->bg_lock);
-	fc->congestion_threshold = val;
-	spin_unlock(&fc->bg_lock);
-	up_read(&fc->killsb);
+	WRITE_ONCE(fc->congestion_threshold, val);
 	fuse_conn_put(fc);
 out:
 	return ret;
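The new knob documented above is an ordinary procfs file, so it can be poked with sysctl(8) (fs.fuse.max_pages_limit) or plain file I/O. A minimal userspace sketch, assuming a kernel with this patch and CONFIG_SYSCTL enabled, that reads and prints the current limit:

/* Sketch: query the max_pages_limit knob added in the fs.rst hunk above.
 * Assumes a patched kernel with CONFIG_SYSCTL; standard C only. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/fs/fuse/max_pages_limit", "r");
	unsigned int limit;

	if (!f) {
		perror("open max_pages_limit");
		return 1;
	}
	if (fscanf(f, "%u", &limit) == 1)
		printf("fuse max_pages_limit: %u\n", limit);
	fclose(f);
	return 0;
}

Writes through the same file are range-checked by proc_douintvec_minmax() against [1, 65535], matching the u16 max_pages field of fuse_init_out (see the new fs/fuse/sysctl.c later in this patch).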
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 3c842286cdbbde078e13754a8cd629b196920c47..0d9c6e604cd96187e76d7c5596dde33b05c82690 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -29,8 +29,12 @@ MODULE_ALIAS("devname:fuse");
 #define FUSE_INT_REQ_BIT (1ULL << 0)
 #define FUSE_REQ_ID_STEP (1ULL << 1)
 
+#define DEFAULT_BG_QUEUE READ
+
 static struct kmem_cache *fuse_req_cachep;
 
+static void end_requests(struct list_head *head);
+
 static struct fuse_dev *fuse_get_dev(struct file *file)
 {
 	/*
@@ -192,11 +196,22 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
 }
 EXPORT_SYMBOL_GPL(fuse_len_args);
 
-u64 fuse_get_unique(struct fuse_iqueue *fiq)
+static u64 fuse_get_unique_locked(struct fuse_iqueue *fiq)
 {
 	fiq->reqctr += FUSE_REQ_ID_STEP;
 	return fiq->reqctr;
 }
+
+u64 fuse_get_unique(struct fuse_iqueue *fiq)
+{
+	u64 ret;
+
+	spin_lock(&fiq->lock);
+	ret = fuse_get_unique_locked(fiq);
+	spin_unlock(&fiq->lock);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(fuse_get_unique);
 
 static unsigned int fuse_req_hash(u64 unique)
@@ -215,22 +230,69 @@ __releases(fiq->lock)
 	spin_unlock(&fiq->lock);
 }
 
+static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget)
+{
+	spin_lock(&fiq->lock);
+	if (fiq->connected) {
+		fiq->forget_list_tail->next = forget;
+		fiq->forget_list_tail = forget;
+		fuse_dev_wake_and_unlock(fiq);
+	} else {
+		kfree(forget);
+		spin_unlock(&fiq->lock);
+	}
+}
+
+static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+{
+	spin_lock(&fiq->lock);
+	if (list_empty(&req->intr_entry)) {
+		list_add_tail(&req->intr_entry, &fiq->interrupts);
+		/*
+		 * Pairs with smp_mb() implied by test_and_set_bit()
+		 * from fuse_request_end().
+		 */
+		smp_mb();
+		if (test_bit(FR_FINISHED, &req->flags)) {
+			list_del_init(&req->intr_entry);
+			spin_unlock(&fiq->lock);
+		} else {
+			fuse_dev_wake_and_unlock(fiq);
+		}
+	} else {
+		spin_unlock(&fiq->lock);
+	}
+}
+
+static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req)
+{
+	spin_lock(&fiq->lock);
+	if (fiq->connected) {
+		if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+			req->in.h.unique = fuse_get_unique_locked(fiq);
+		list_add_tail(&req->list, &fiq->pending);
+		fuse_dev_wake_and_unlock(fiq);
+	} else {
+		spin_unlock(&fiq->lock);
+		req->out.h.error = -ENOTCONN;
+		clear_bit(FR_PENDING, &req->flags);
+		fuse_request_end(req);
+	}
+}
+
 const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
-	.wake_forget_and_unlock		= fuse_dev_wake_and_unlock,
-	.wake_interrupt_and_unlock	= fuse_dev_wake_and_unlock,
-	.wake_pending_and_unlock	= fuse_dev_wake_and_unlock,
+	.send_forget	= fuse_dev_queue_forget,
+	.send_interrupt	= fuse_dev_queue_interrupt,
+	.send_req	= fuse_dev_queue_req,
 };
 EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
 
-static void queue_request_and_unlock(struct fuse_iqueue *fiq,
-				     struct fuse_req *req)
-__releases(fiq->lock)
+static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		fuse_len_args(req->args->in_numargs,
 			      (struct fuse_arg *) req->args->in_args);
-	list_add_tail(&req->list, &fiq->pending);
-	fiq->ops->wake_pending_and_unlock(fiq);
+	fiq->ops->send_req(fiq, req);
 }
 
 void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
@@ -241,31 +303,71 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
 	forget->forget_one.nodeid = nodeid;
 	forget->forget_one.nlookup = nlookup;
 
-	spin_lock(&fiq->lock);
-	if (fiq->connected) {
-		fiq->forget_list_tail->next = forget;
-		fiq->forget_list_tail = forget;
-		fiq->ops->wake_forget_and_unlock(fiq);
+	fiq->ops->send_forget(fiq, forget);
+}
+
+static void fuse_add_bg_queue(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (fc->separate_background) {
+		if (req->args->opcode == FUSE_WRITE)
+			list_add_tail(&req->list, &fc->bg_queue[WRITE]);
+		else
+			list_add_tail(&req->list, &fc->bg_queue[READ]);
 	} else {
-		kfree(forget);
-		spin_unlock(&fiq->lock);
+		/* default to a single background queue */
+		list_add_tail(&req->list, &fc->bg_queue[DEFAULT_BG_QUEUE]);
 	}
 }
 
-static void flush_bg_queue(struct fuse_conn *fc)
+static void fuse_dec_active_bg(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (fc->separate_background) {
+		if (req->args->opcode == FUSE_WRITE)
+			fc->active_background[WRITE]--;
+		else
+			fc->active_background[READ]--;
+	} else {
+		/* default to a single count */
+		fc->active_background[DEFAULT_BG_QUEUE]--;
+	}
+}
+
+/* Returns true if the bg_queue still needs to be flushed */
+static bool do_flush_bg_queue(struct fuse_conn *fc, unsigned int index,
+			      unsigned int batch)
 {
 	struct fuse_iqueue *fiq = &fc->iq;
+	struct fuse_req *req;
+	unsigned int count = 0;
+
+	while (fc->active_background[index] < fc->max_background &&
+	       !list_empty(&fc->bg_queue[index])) {
+		if (batch && count++ == batch)
+			return true;
+		req = list_first_entry(&fc->bg_queue[index],
+				       struct fuse_req, list);
+		list_del(&req->list);
+		fc->active_background[index]++;
+		fuse_send_one(fiq, req);
+	}
+	return false;
+}
 
-	while (fc->active_background < fc->max_background &&
-	       !list_empty(&fc->bg_queue)) {
-		struct fuse_req *req;
+static void flush_bg_queue(struct fuse_conn *fc)
+{
+	if (!fc->separate_background) {
+		do_flush_bg_queue(fc, DEFAULT_BG_QUEUE, 0);
+	} else {
+		bool proceed_write = true, proceed_other = true;
 
-		req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
-		list_del(&req->list);
-		fc->active_background++;
-		spin_lock(&fiq->lock);
-		req->in.h.unique = fuse_get_unique(fiq);
-		queue_request_and_unlock(fiq, req);
+		do {
+			if (proceed_other)
+				proceed_other = do_flush_bg_queue(fc, READ,
+						FUSE_DEFAULT_MAX_BACKGROUND);
+			if (proceed_write)
+				proceed_write = do_flush_bg_queue(fc, WRITE,
+						FUSE_DEFAULT_MAX_BACKGROUND);
+		} while (proceed_other || proceed_write);
 	}
 }
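With FUSE_SEPARATE_BACKGROUND negotiated, flush_bg_queue() above alternates between the two queues, letting each send at most FUSE_DEFAULT_MAX_BACKGROUND (12) requests per round so that neither WRITE requests nor the READ-class requests can starve the other. A self-contained userspace model of that loop — the queue depths and the active cap are made-up numbers, only the control flow mirrors the kernel code:

/* Userspace model of the batched round-robin in flush_bg_queue() above.
 * BATCH mirrors FUSE_DEFAULT_MAX_BACKGROUND; queue depths are invented. */
#include <stdbool.h>
#include <stdio.h>

#define BATCH      12
#define MAX_ACTIVE 100

static unsigned int queued[2] = { 60, 30 };	/* [0] = READ/others, [1] = WRITE */
static unsigned int active;

/* Returns true if this queue still needs flushing, as do_flush_bg_queue() does */
static bool flush_one(int idx, unsigned int batch)
{
	unsigned int count = 0;

	while (active < MAX_ACTIVE && queued[idx]) {
		if (batch && count++ == batch)
			return true;
		queued[idx]--;
		active++;
	}
	return false;
}

int main(void)
{
	bool more_other = true, more_write = true;

	do {
		if (more_other)
			more_other = flush_one(0, BATCH);
		if (more_write)
			more_write = flush_one(1, BATCH);
		printf("queued other=%2u write=%2u active=%u\n",
		       queued[0], queued[1], active);
	} while (more_other || more_write);
	return 0;
}

Running it shows both counters draining in interleaved steps of at most 12, which is the fairness property the split queues are after.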
@@ -316,7 +418,7 @@ void fuse_request_end(struct fuse_req *req)
 		}
 		fc->num_background--;
-		fc->active_background--;
+		fuse_dec_active_bg(fc, req);
 		flush_bg_queue(fc);
 		spin_unlock(&fc->bg_lock);
 	} else {
@@ -335,29 +437,12 @@ static int queue_interrupt(struct fuse_req *req)
 {
 	struct fuse_iqueue *fiq = &req->fm->fc->iq;
 
-	spin_lock(&fiq->lock);
 	/* Check for we've sent request to interrupt this req */
-	if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
-		spin_unlock(&fiq->lock);
+	if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags)))
 		return -EINVAL;
-	}
 
-	if (list_empty(&req->intr_entry)) {
-		list_add_tail(&req->intr_entry, &fiq->interrupts);
-		/*
-		 * Pairs with smp_mb() implied by test_and_set_bit()
-		 * from fuse_request_end().
-		 */
-		smp_mb();
-		if (test_bit(FR_FINISHED, &req->flags)) {
-			list_del_init(&req->intr_entry);
-			spin_unlock(&fiq->lock);
-			return 0;
-		}
-		fiq->ops->wake_interrupt_and_unlock(fiq);
-	} else {
-		spin_unlock(&fiq->lock);
-	}
+	fiq->ops->send_interrupt(fiq, req);
+
 	return 0;
 }
@@ -412,21 +497,15 @@ static void __fuse_request_send(struct fuse_req *req)
 	struct fuse_iqueue *fiq = &req->fm->fc->iq;
 
 	BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
-	spin_lock(&fiq->lock);
-	if (!fiq->connected) {
-		spin_unlock(&fiq->lock);
-		req->out.h.error = -ENOTCONN;
-	} else {
-		req->in.h.unique = fuse_get_unique(fiq);
-		/* acquire extra reference, since request is still needed
-		   after fuse_request_end() */
-		__fuse_get_request(req);
-		queue_request_and_unlock(fiq, req);
-		request_wait_answer(req);
-		/* Pairs with smp_wmb() in fuse_request_end() */
-		smp_rmb();
-	}
+	/* acquire extra reference, since request is still needed after
+	   fuse_request_end() */
+	__fuse_get_request(req);
+	fuse_send_one(fiq, req);
+
+	request_wait_answer(req);
+	/* Pairs with smp_wmb() in fuse_request_end() */
+	smp_rmb();
 }
 
 static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
@@ -538,7 +617,7 @@ static bool fuse_request_queue_background(struct fuse_req *req)
 		fc->num_background++;
 		if (fc->num_background == fc->max_background)
 			fc->blocked = 1;
-		list_add_tail(&req->list, &fc->bg_queue);
+		fuse_add_bg_queue(fc, req);
 		flush_bg_queue(fc);
 		queued = true;
 	}
@@ -581,7 +660,6 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm,
 {
 	struct fuse_req *req;
 	struct fuse_iqueue *fiq = &fm->fc->iq;
-	int err = 0;
 
 	req = fuse_get_req(fm, false);
 	if (IS_ERR(req))
@@ -592,16 +670,9 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm,
 
 	fuse_args_to_req(req, args);
 
-	spin_lock(&fiq->lock);
-	if (fiq->connected) {
-		queue_request_and_unlock(fiq, req);
-	} else {
-		err = -ENODEV;
-		spin_unlock(&fiq->lock);
-		fuse_put_request(req);
-	}
+	fuse_send_one(fiq, req);
 
-	return err;
+	return 0;
 }
 
 /*
@@ -1076,9 +1147,9 @@ __releases(fiq->lock)
 	return err ? err : reqsize;
 }
 
-struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
-					     unsigned int max,
-					     unsigned int *countp)
+static struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+						    unsigned int max,
+						    unsigned int *countp)
 {
 	struct fuse_forget_link *head = fiq->forget_list_head.next;
 	struct fuse_forget_link **newhead = &head;
@@ -1097,7 +1168,6 @@ struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
 
 	return head;
 }
-EXPORT_SYMBOL(fuse_dequeue_forget);
 
 static int fuse_read_single_forget(struct fuse_iqueue *fiq,
 				   struct fuse_copy_state *cs,
@@ -1112,7 +1182,7 @@ __releases(fiq->lock)
 	struct fuse_in_header ih = {
 		.opcode = FUSE_FORGET,
 		.nodeid = forget->forget_one.nodeid,
-		.unique = fuse_get_unique(fiq),
+		.unique = fuse_get_unique_locked(fiq),
 		.len = sizeof(ih) + sizeof(arg),
 	};
 
@@ -1143,7 +1213,7 @@ __releases(fiq->lock)
 	struct fuse_batch_forget_in arg = { .count = 0 };
 	struct fuse_in_header ih = {
 		.opcode = FUSE_BATCH_FORGET,
-		.unique = fuse_get_unique(fiq),
+		.unique = fuse_get_unique_locked(fiq),
 		.len = sizeof(ih) + sizeof(arg),
 	};
 
@@ -1777,6 +1847,68 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
 	return err;
 }
 
+/*
+ * Resend all processing queue requests.
+ *
+ * When a FUSE daemon panics and fails over, it is possible for some inflight
+ * requests to be lost and never returned. As a result, applications awaiting
+ * replies would become stuck forever. To address this, we can use notification
+ * to trigger resending of these pending requests to the FUSE daemon, ensuring
+ * they are properly processed again.
+ *
+ * Please note that this strategy is applicable only to idempotent requests or
+ * if the FUSE daemon takes careful measures to avoid processing duplicated
+ * non-idempotent requests.
+ */
+static void fuse_resend(struct fuse_conn *fc)
+{
+	struct fuse_dev *fud;
+	struct fuse_req *req, *next;
+	struct fuse_iqueue *fiq = &fc->iq;
+	LIST_HEAD(to_queue);
+	unsigned int i;
+
+	spin_lock(&fc->lock);
+	if (!fc->connected) {
+		spin_unlock(&fc->lock);
+		return;
+	}
+
+	list_for_each_entry(fud, &fc->devices, entry) {
+		struct fuse_pqueue *fpq = &fud->pq;
+
+		spin_lock(&fpq->lock);
+		for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+			list_splice_tail_init(&fpq->processing[i], &to_queue);
+		spin_unlock(&fpq->lock);
+	}
+	spin_unlock(&fc->lock);
+
+	list_for_each_entry_safe(req, next, &to_queue, list) {
+		set_bit(FR_PENDING, &req->flags);
+		/* mark the request as a resend request */
+		req->in.h.unique |= FUSE_UNIQUE_RESEND;
+	}
+
+	spin_lock(&fiq->lock);
+	if (!fiq->connected) {
+		spin_unlock(&fiq->lock);
+		list_for_each_entry(req, &to_queue, list)
+			clear_bit(FR_PENDING, &req->flags);
+		end_requests(&to_queue);
+		return;
+	}
+	/* iq and pq requests are both oldest to newest */
+	list_splice(&to_queue, &fiq->pending);
+	fuse_dev_wake_and_unlock(fiq);
+}
+
+static int fuse_notify_resend(struct fuse_conn *fc)
+{
+	fuse_resend(fc);
+	return 0;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -1802,6 +1934,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_DELETE:
 		return fuse_notify_delete(fc, size, cs);
 
+	case FUSE_NOTIFY_RESEND:
+		return fuse_notify_resend(fc);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
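A daemon that enables FUSE_HAS_RESEND must be ready to see the same request twice: the kernel re-queues inflight requests with the top bit of the unique ID set (FUSE_UNIQUE_RESEND, defined in the uapi hunk at the end of this patch). A sketch of daemon-side dedup under the constraints named in the comment above — seen_before() and process_request() are hypothetical stand-ins for the daemon's own bookkeeping and dispatch:

/* Sketch: daemon-side handling of resent requests. FUSE_UNIQUE_RESEND is
 * copied from the uapi hunk below; the two extern helpers are hypothetical. */
#include <stdbool.h>
#include <stdint.h>

#define FUSE_UNIQUE_RESEND (1ULL << 63)

extern bool seen_before(uint64_t unique);	/* completed-request table */
extern void process_request(uint64_t unique);	/* normal dispatch path */

void handle_request(uint64_t unique)
{
	if (unique & FUSE_UNIQUE_RESEND) {
		/*
		 * Replay after failover: skip non-idempotent work the
		 * previous daemon instance already completed, but still
		 * reply using the unique ID exactly as received.
		 */
		if (seen_before(unique & ~FUSE_UNIQUE_RESEND))
			return;	/* reply from the recorded result instead */
	}
	process_request(unique);
}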
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 25d6951ef2c04f3776efa9b298a3c50b9eb781d9..4c1922e8c3f867ed87f5474dc8ab7e6f4d0ce136 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -448,9 +448,6 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
 
 /*
  * Check if any page in a range is under writeback
- *
- * This is currently done by walking the list of writepage requests
- * for the inode, which can be pretty inefficient.
  */
 static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
 				    pgoff_t idx_to)
@@ -458,6 +455,9 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	bool found;
 
+	if (RB_EMPTY_ROOT(&fi->writepages))
+		return false;
+
 	spin_lock(&fi->lock);
 	found = fuse_find_writeback(fi, idx_from, idx_to);
 	spin_unlock(&fi->lock);
@@ -982,6 +982,11 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 	if (fm->fc->async_read) {
 		ia->ff = fuse_file_get(ff);
 		ap->args.end = fuse_readpages_end;
+		/* force background request to avoid starvation from writeback */
+		if (fm->fc->separate_background) {
+			ap->args.force = true;
+			ap->args.nocreds = true;
+		}
 		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
 		if (!err)
 			return;
@@ -995,12 +1000,17 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 static void fuse_readahead(struct readahead_control *rac)
 {
 	struct inode *inode = rac->mapping->host;
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	unsigned int i, max_pages, nr_pages = 0;
+	pgoff_t first = readahead_index(rac);
+	pgoff_t last = first + readahead_count(rac) - 1;
 
 	if (fuse_is_bad(inode))
 		return;
 
+	wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last));
+
 	max_pages = min_t(unsigned int, fc->max_pages,
 			fc->max_read / PAGE_SIZE);
 
@@ -1027,8 +1037,6 @@ static void fuse_readahead(struct readahead_control *rac)
 		ap = &ia->ap;
 		nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
 		for (i = 0; i < nr_pages; i++) {
-			fuse_wait_on_page_writeback(inode,
-						    readahead_index(rac) + i);
 			ap->descs[i].length = PAGE_SIZE;
 		}
 		ap->num_pages = nr_pages;
@@ -2300,6 +2308,10 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
 	if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
 		return true;
 
+	/* Reached alignment boundary */
+	if (fc->write_alignment && !(page->index % fc->write_align_pages))
+		return true;
+
 	return false;
 }
 
@@ -2733,10 +2745,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
 			return -ENOLCK;
 	}
 
-	/* Unlock on close is handled by the flush method */
-	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
-		return 0;
-
 	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
 
 	err = fuse_simple_request(fm, &args);
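The alignment cut above works in page units: process_init_reply() (later in this patch) rejects FUSE_WRITE_ALIGNMENT when max_write is not a multiple of PAGE_SIZE, and otherwise sets write_align_pages = max_write >> PAGE_SHIFT. Worked numbers for the page->index % write_align_pages test, assuming 4 KiB pages and a 1 MiB max_write:

/* Worked example of the boundary test in fuse_writepage_need_send() above,
 * assuming PAGE_SIZE = 4096 and max_write = 1 MiB => write_align_pages = 256. */
#include <stdio.h>

int main(void)
{
	unsigned long write_align_pages = (1024 * 1024) / 4096;	/* 256 */
	unsigned long index;

	for (index = 254; index < 258; index++)
		printf("page index %lu: %s\n", index,
		       index % write_align_pages == 0 ?
		       "cut request at boundary" : "may extend request");
	return 0;
}

So a long streaming writeback gets split into requests that each stay inside one max_write-aligned window, which is what the flag promises the daemon.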
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0428658c499ed41149a9aab4b9951d8ef843d135..665e89d8ea5b8558fac076ce16a40cf073404bbf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -36,8 +36,8 @@
 /** Default max number of pages that can be used in a single read request */
 #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
 
-/** Maximum of max_pages received in init_out */
-#define FUSE_MAX_MAX_PAGES 256
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
 
 /** Bias for fi->writectr, meaning new writepages must not be sent */
 #define FUSE_NOWRITE INT_MIN
@@ -48,6 +48,9 @@
 /** Number of dentries for each connection in the control filesystem */
 #define FUSE_CTL_NUM_DENTRIES 5
 
+/** Maximum of max_pages received in init_out */
+extern unsigned int fuse_max_pages_limit;
+
 /** List of active connections */
 extern struct list_head fuse_conn_list;
 
@@ -460,22 +463,19 @@ struct fuse_iqueue;
  */
 struct fuse_iqueue_ops {
 	/**
-	 * Signal that a forget has been queued
+	 * Send one forget
 	 */
-	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
-		__releases(fiq->lock);
+	void (*send_forget)(struct fuse_iqueue *fiq, struct fuse_forget_link *link);
 
 	/**
-	 * Signal that an INTERRUPT request has been queued
+	 * Send interrupt for request
 	 */
-	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
-		__releases(fiq->lock);
+	void (*send_interrupt)(struct fuse_iqueue *fiq, struct fuse_req *req);
 
 	/**
-	 * Signal that a request has been queued
+	 * Send one request
 	 */
-	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
-		__releases(fiq->lock);
+	void (*send_req)(struct fuse_iqueue *fiq, struct fuse_req *req);
 
 	/**
 	 * Clean up when fuse_iqueue is destroyed
@@ -640,6 +640,9 @@ struct fuse_conn {
 	/** Maximum write size */
 	unsigned max_write;
 
+	/* Maximum number of pages that write requests should be aligned with */
+	unsigned int write_align_pages;
+
 	/** Maximum number of pages that can be used in a single request */
 	unsigned int max_pages;
 
@@ -664,11 +667,18 @@ struct fuse_conn {
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
-	/** Number of background requests currently queued for userspace */
-	unsigned active_background;
+	/*
+	 * Number of background requests currently queued for userspace.
+	 * active_background[WRITE] for WRITE requests, and
+	 * active_background[READ] for others.
+	 */
+	unsigned active_background[2];
 
-	/** The list of background requests set aside for later queuing */
-	struct list_head bg_queue;
+	/*
+	 * The list of background requests set aside for later queuing.
+	 * bg_queue[WRITE] for WRITE requests, bg_queue[READ] for others.
+	 */
+	struct list_head bg_queue[2];
 
 	/** Protects: max_background, congestion_threshold, num_background,
 	 * active_background, bg_queue, blocked */
@@ -867,6 +877,9 @@ struct fuse_conn {
 	/* Relax restrictions to allow shared mmap in FOPEN_DIRECT_IO mode */
 	unsigned int direct_io_allow_mmap:1;
 
+	/* separate background queue for WRITE requests and the others */
+	unsigned int separate_background:1;
+
 	/* Is statx not implemented by fs? */
 	unsigned int no_statx:1;
 
@@ -876,6 +889,9 @@ struct fuse_conn {
 	/** Passthrough support for read/write IO */
 	unsigned int passthrough:1;
 
+	/* write requests are aligned on the max_write boundary */
+	unsigned int write_alignment:1;
+
 	/** Maximum stack depth for passthrough backing files */
 	int max_stack_depth;
 
@@ -1074,10 +1090,6 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
 
 struct fuse_forget_link *fuse_alloc_forget(void);
 
-struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
-					     unsigned int max,
-					     unsigned int *countp);
-
 /*
  * Initialize READ or READDIR request
 */
@@ -1493,4 +1505,12 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
 				      size_t len, unsigned int flags);
 ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma);
 
+#ifdef CONFIG_SYSCTL
+extern int fuse_sysctl_register(void);
+extern void fuse_sysctl_unregister(void);
+#else
+#define fuse_sysctl_register()		(0)
+#define fuse_sysctl_unregister()	do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d771ce993e3e0eb7dceccc9830d5d44686d4ed17..0c95552179e59126bb87b75a581894bae51b23ba 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -35,6 +35,8 @@ DEFINE_MUTEX(fuse_mutex);
 
 static int set_global_limit(const char *val, const struct kernel_param *kp);
 
+unsigned int fuse_max_pages_limit = 256;
+
 unsigned max_user_bgreq;
 module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
 		  &max_user_bgreq, 0644);
@@ -53,9 +55,6 @@ MODULE_PARM_DESC(max_user_congthresh,
 
 #define FUSE_DEFAULT_BLKSIZE 512
 
-/** Maximum number of outstanding background requests */
-#define FUSE_DEFAULT_MAX_BACKGROUND 12
-
 /** Congestion starts at 75% of maximum */
 #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
 
@@ -935,7 +934,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	atomic_set(&fc->dev_count, 1);
 	init_waitqueue_head(&fc->blocked_waitq);
 	fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
-	INIT_LIST_HEAD(&fc->bg_queue);
+	INIT_LIST_HEAD(&fc->bg_queue[READ]);
+	INIT_LIST_HEAD(&fc->bg_queue[WRITE]);
 	INIT_LIST_HEAD(&fc->entry);
 	INIT_LIST_HEAD(&fc->devices);
 	atomic_set(&fc->num_waiting, 0);
@@ -951,7 +951,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
 	fc->user_ns = get_user_ns(user_ns);
 	fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
-	fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
+	fc->max_pages_limit = fuse_max_pages_limit;
 
 	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		fuse_backing_files_init(fc);
@@ -1150,6 +1150,11 @@ static struct dentry *fuse_get_parent(struct dentry *child)
 	return parent;
 }
 
+/* only for fid encoding; no support for file handle decoding */
+static const struct export_operations fuse_export_fid_operations = {
+	.encode_fh	= fuse_encode_fh,
+};
+
 static const struct export_operations fuse_export_operations = {
 	.fh_to_dentry	= fuse_fh_to_dentry,
 	.fh_to_parent	= fuse_fh_to_parent,
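With the fid-only table installed (when FUSE_NO_EXPORT_SUPPORT is negotiated, see the next hunk), encode_fh still works — so name_to_handle_at(2) keeps succeeding, e.g. for fanotify users — while the decode paths are gone, so open_by_handle_at(2) should fail. A userspace probe, with a hypothetical mount point and the exact errno left to the VFS fallback:

/* Probe for the fid-only export ops: handle encoding stays available,
 * handle decoding does not. /mnt/fuse is hypothetical, and
 * open_by_handle_at() additionally needs CAP_DAC_READ_SEARCH. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	int mount_id, mfd;

	fh->handle_bytes = MAX_HANDLE_SZ;
	if (name_to_handle_at(AT_FDCWD, "/mnt/fuse/file", fh, &mount_id, 0) == 0)
		printf("encode_fh path works: got a %u-byte handle\n",
		       fh->handle_bytes);

	mfd = open("/mnt/fuse", O_RDONLY);
	if (open_by_handle_at(mfd, fh, O_RDONLY) < 0)
		perror("open_by_handle_at (expected to fail)");

	free(fh);
	return 0;
}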
@@ -1347,6 +1352,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 				fc->max_stack_depth = arg->max_stack_depth;
 				fm->sb->s_stack_depth = arg->max_stack_depth;
 			}
+			if (flags & FUSE_NO_EXPORT_SUPPORT)
+				fm->sb->s_export_op = &fuse_export_fid_operations;
+			if (flags & FUSE_SEPARATE_BACKGROUND)
+				fc->separate_background = 1;
+			if (flags & FUSE_WRITE_ALIGNMENT)
+				fc->write_alignment = 1;
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -1358,6 +1369,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 		fc->minor = arg->minor;
 		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
+		if (fc->write_alignment) {
+			if (fc->max_write % PAGE_SIZE)
+				ok = false;
+			else
+				fc->write_align_pages = fc->max_write >> PAGE_SHIFT;
+		}
 		fc->conn_init = 1;
 	}
 	kfree(ia);
@@ -1393,7 +1410,9 @@ void fuse_send_init(struct fuse_mount *fm)
 		FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
 		FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
 		FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
-		FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP;
+		FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
+		FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND |
+		FUSE_SEPARATE_BACKGROUND | FUSE_WRITE_ALIGNMENT;
 #ifdef CONFIG_FUSE_DAX
 	if (fm->fc->dax)
 		flags |= FUSE_MAP_ALIGNMENT;
@@ -1590,6 +1609,7 @@ static int fuse_fill_super_submount(struct super_block *sb,
 
 	sb->s_bdi = bdi_get(parent_sb->s_bdi);
 	sb->s_xattr = parent_sb->s_xattr;
+	sb->s_export_op = parent_sb->s_export_op;
 	sb->s_time_gran = parent_sb->s_time_gran;
 	sb->s_blocksize = parent_sb->s_blocksize;
 	sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
@@ -2051,8 +2071,14 @@ static int __init fuse_fs_init(void)
 	if (err)
 		goto out3;
 
+	err = fuse_sysctl_register();
+	if (err)
+		goto out4;
+
 	return 0;
 
+ out4:
+	unregister_filesystem(&fuse_fs_type);
 out3:
 	unregister_fuseblk();
 out2:
@@ -2063,6 +2089,7 @@ static int __init fuse_fs_init(void)
 
 static void fuse_fs_cleanup(void)
 {
+	fuse_sysctl_unregister();
 	unregister_filesystem(&fuse_fs_type);
 	unregister_fuseblk();
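None of the new behavior is unconditional: fuse_send_init() only advertises the bits, and process_init_reply() applies them when the daemon echoes them back, failing the INIT exchange if FUSE_WRITE_ALIGNMENT is requested with a max_write that is not page-aligned. A daemon-side sketch of that negotiation — flag values are copied from the uapi hunk below, and how the result lands in fuse_init_out is left to the daemon's FUSE library:

/* Sketch: daemon policy for echoing the new INIT flags back to the kernel.
 * Flag values come from the uapi hunk below; the reply plumbing is abstract. */
#include <stdint.h>

#define FUSE_WRITE_ALIGNMENT	 (1ULL << 55)
#define FUSE_SEPARATE_BACKGROUND (1ULL << 56)

struct init_policy {
	uint64_t flags;
	uint32_t max_write;
};

struct init_policy make_init_policy(uint64_t kernel_flags, uint32_t page_size)
{
	struct init_policy p = { .flags = 0, .max_write = 1024 * 1024 };

	/* Echo a flag back only if the kernel offered it */
	p.flags |= kernel_flags & FUSE_SEPARATE_BACKGROUND;

	/* process_init_reply() marks the reply invalid unless max_write is
	 * a whole number of pages, so check before opting in */
	if ((kernel_flags & FUSE_WRITE_ALIGNMENT) &&
	    p.max_write % page_size == 0)
		p.flags |= FUSE_WRITE_ALIGNMENT;

	return p;
}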
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 726640fa439e04a93277697fef4d359d8f7f6822..a6c8ee551635143bb659cd4369ec773d710a7b4d 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -8,6 +8,9 @@
 #include <linux/uio.h>
 #include <linux/compat.h>
 #include <linux/fileattr.h>
+#include <linux/fsverity.h>
+
+#define FUSE_VERITY_ENABLE_ARG_MAX_PAGES 256
 
 static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args,
 			       struct fuse_ioctl_out *outarg)
@@ -117,6 +120,53 @@ static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
 	return 0;
 }
 
+/* For fs-verity, determine iov lengths from input */
+static int fuse_setup_measure_verity(unsigned long arg, struct iovec *iov)
+{
+	__u16 digest_size;
+	struct fsverity_digest __user *uarg = (void __user *)arg;
+
+	if (copy_from_user(&digest_size, &uarg->digest_size, sizeof(digest_size)))
+		return -EFAULT;
+
+	if (digest_size > SIZE_MAX - sizeof(struct fsverity_digest))
+		return -EINVAL;
+
+	iov->iov_len = sizeof(struct fsverity_digest) + digest_size;
+
+	return 0;
+}
+
+static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov,
+				    unsigned int *in_iovs)
+{
+	struct fsverity_enable_arg enable;
+	struct fsverity_enable_arg __user *uarg = (void __user *)arg;
+	const __u32 max_buffer_len = FUSE_VERITY_ENABLE_ARG_MAX_PAGES * PAGE_SIZE;
+
+	if (copy_from_user(&enable, uarg, sizeof(enable)))
+		return -EFAULT;
+
+	if (enable.salt_size > max_buffer_len || enable.sig_size > max_buffer_len)
+		return -ENOMEM;
+
+	if (enable.salt_size > 0) {
+		iov++;
+		(*in_iovs)++;
+
+		iov->iov_base = u64_to_user_ptr(enable.salt_ptr);
+		iov->iov_len = enable.salt_size;
+	}
+
+	if (enable.sig_size > 0) {
+		iov++;
+		(*in_iovs)++;
+
+		iov->iov_base = u64_to_user_ptr(enable.sig_ptr);
+		iov->iov_len = enable.sig_size;
+	}
+	return 0;
+}
+
 /*
  * For ioctls, there is no generic way to determine how much memory
@@ -227,6 +277,18 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 			out_iov = iov;
 			out_iovs = 1;
 		}
+
+		err = 0;
+		switch (cmd) {
+		case FS_IOC_MEASURE_VERITY:
+			err = fuse_setup_measure_verity(arg, iov);
+			break;
+		case FS_IOC_ENABLE_VERITY:
+			err = fuse_setup_enable_verity(arg, iov, &in_iovs);
+			break;
+		}
+		if (err)
+			goto out;
 	}
 
 retry:
diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c
new file mode 100644
index 0000000000000000000000000000000000000000..b272bb333005a8ca4e5f10b2e1a659f3f38b660a
--- /dev/null
+++ b/fs/fuse/sysctl.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/fuse/sysctl.c
+ *
+ * Sysctl interface to fuse parameters
+ */
+#include <linux/sysctl.h>
+
+#include "fuse_i.h"
+
+static struct ctl_table_header *fuse_table_header;
+
+/* Bound by fuse_init_out max_pages, which is a u16 */
+static unsigned int sysctl_fuse_max_pages_limit = 65535;
+
+static struct ctl_table fuse_sysctl_table[] = {
+	{
+		.procname	= "max_pages_limit",
+		.data		= &fuse_max_pages_limit,
+		.maxlen		= sizeof(fuse_max_pages_limit),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &sysctl_fuse_max_pages_limit,
+	},
+};
+
+int fuse_sysctl_register(void)
+{
+	fuse_table_header = register_sysctl("fs/fuse", fuse_sysctl_table);
+	if (!fuse_table_header)
+		return -ENOMEM;
+	return 0;
+}
+
+void fuse_sysctl_unregister(void)
+{
+	unregister_sysctl_table(fuse_table_header);
+	fuse_table_header = NULL;
+}
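From an application on the mount, the iovec plumbing above is invisible: enabling or measuring verity is the stock fs-verity ioctl, which FUSE now forwards with correctly sized buffers. A minimal caller — the path is hypothetical and the FUSE daemon still has to implement the operation:

/* Application view of the fs-verity wiring above: a plain
 * FS_IOC_ENABLE_VERITY on a file inside the FUSE mount. */
#include <fcntl.h>
#include <linux/fsverity.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

int main(void)
{
	struct fsverity_enable_arg arg;
	int fd = open("/mnt/fuse/file", O_RDONLY); /* must not be open for write */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&arg, 0, sizeof(arg));
	arg.version = 1;
	arg.hash_algorithm = FS_VERITY_HASH_ALG_SHA256;
	arg.block_size = 4096;

	if (ioctl(fd, FS_IOC_ENABLE_VERITY, &arg))
		perror("FS_IOC_ENABLE_VERITY");
	return 0;
}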
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index ba816b50c1d41cc488014b5ebbe2f666f1ab8d74..a6e31e68906a037505fd6a7df11feee0f7d65c37 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1083,22 +1083,13 @@ static struct virtio_driver virtio_fs_driver = {
 #endif
 };
 
-static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
-__releases(fiq->lock)
+static void virtio_fs_send_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *link)
 {
-	struct fuse_forget_link *link;
 	struct virtio_fs_forget *forget;
 	struct virtio_fs_forget_req *req;
-	struct virtio_fs *fs;
-	struct virtio_fs_vq *fsvq;
-	u64 unique;
-
-	link = fuse_dequeue_forget(fiq, 1, NULL);
-	unique = fuse_get_unique(fiq);
-
-	fs = fiq->priv;
-	fsvq = &fs->vqs[VQ_HIPRIO];
-	spin_unlock(&fiq->lock);
+	struct virtio_fs *fs = fiq->priv;
+	struct virtio_fs_vq *fsvq = &fs->vqs[VQ_HIPRIO];
+	u64 unique = fuse_get_unique(fiq);
 
 	/* Allocate a buffer for the request */
 	forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
@@ -1118,8 +1109,7 @@ __releases(fiq->lock)
 	kfree(link);
 }
 
-static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
-__releases(fiq->lock)
+static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	/*
 	 * TODO interrupts.
@@ -1128,7 +1118,6 @@ __releases(fiq->lock)
 	 * Exceptions are blocking lock operations; for example fcntl(F_SETLKW)
 	 * with shared lock between host and guest.
 	 */
-	spin_unlock(&fiq->lock);
 }
 
 /* Count number of scatter-gather elements required */
@@ -1333,21 +1322,17 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
 	return ret;
 }
 
-static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
-__releases(fiq->lock)
+static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	unsigned int queue_id;
 	struct virtio_fs *fs;
-	struct fuse_req *req;
 	struct virtio_fs_vq *fsvq;
 	int ret;
 
-	WARN_ON(list_empty(&fiq->pending));
-	req = list_last_entry(&fiq->pending, struct fuse_req, list);
+	if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+		req->in.h.unique = fuse_get_unique(fiq);
+
 	clear_bit(FR_PENDING, &req->flags);
-	list_del_init(&req->list);
-	WARN_ON(!list_empty(&fiq->pending));
-	spin_unlock(&fiq->lock);
 
 	fs = fiq->priv;
 	queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()];
@@ -1387,10 +1372,10 @@ __releases(fiq->lock)
 }
 
 static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
-	.wake_forget_and_unlock		= virtio_fs_wake_forget_and_unlock,
-	.wake_interrupt_and_unlock	= virtio_fs_wake_interrupt_and_unlock,
-	.wake_pending_and_unlock	= virtio_fs_wake_pending_and_unlock,
-	.release			= virtio_fs_fiq_release,
+	.send_forget	= virtio_fs_send_forget,
+	.send_interrupt	= virtio_fs_send_interrupt,
+	.send_req	= virtio_fs_send_req,
+	.release	= virtio_fs_fiq_release,
 };
 
 static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 1162a47b6a42d709ac6f3c9fac01783be29d9eb5..dd1809072e6330093f5e321c13d501421cdedc56 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -215,6 +215,8 @@
  *  7.40
  *  - add max_stack_depth to fuse_init_out, add FUSE_PASSTHROUGH init flag
  *  - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag
+ *  - add FUSE_NO_EXPORT_SUPPORT init flag
+ *  - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag
  */
 
 #ifndef _LINUX_FUSE_H
@@ -416,6 +418,12 @@ struct fuse_file_lock {
  *			symlink and mknod (single group that matches parent)
  * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation
  * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode.
+ * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support
+ * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit
+ *		    of the request ID indicates resend requests
+ * FUSE_SEPARATE_BACKGROUND: separate background queue for WRITE requests and
+ *			     the others
+ * FUSE_WRITE_ALIGNMENT: write requests are aligned on the max_write boundary
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -456,6 +464,11 @@ struct fuse_file_lock {
 #define FUSE_HAS_EXPIRE_ONLY	(1ULL << 35)
 #define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36)
 #define FUSE_PASSTHROUGH	(1ULL << 37)
+#define FUSE_NO_EXPORT_SUPPORT	(1ULL << 38)
+#define FUSE_HAS_RESEND		(1ULL << 39)
+#define FUSE_WRITE_ALIGNMENT	(1ULL << 55)
+#define FUSE_SEPARATE_BACKGROUND (1ULL << 56)
+/* Bit 57 is reserved for FUSE_HAS_RECOVERY */
 
 /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */
 #define FUSE_DIRECT_IO_RELAX	FUSE_DIRECT_IO_ALLOW_MMAP
 
@@ -642,6 +655,7 @@ enum fuse_notify_code {
 	FUSE_NOTIFY_STORE = 4,
 	FUSE_NOTIFY_RETRIEVE = 5,
 	FUSE_NOTIFY_DELETE = 6,
+	FUSE_NOTIFY_RESEND = 7,
 	FUSE_NOTIFY_CODE_MAX,
 };
@@ -968,6 +982,14 @@ struct fuse_fallocate_in {
 	uint32_t	padding;
 };
 
+/**
+ * FUSE request unique ID flag
+ *
+ * Indicates whether this is a resend request. The receiver should handle this
+ * request accordingly.
+ */
+#define FUSE_UNIQUE_RESEND (1ULL << 63)
+
 struct fuse_in_header {
 	uint32_t	len;
 	uint32_t	opcode;