diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 30d422b8c0fc7e13bdee316a2d058eab9d9efc73..87f733193dc7f1cb23d7ea0f44a7287a6bc51323 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -105,17 +105,13 @@ void fsnotify_sb_delete(struct super_block *sb) * parent cares. Thus when an event happens on a child it can quickly tell if * if there is a need to find a parent and send the event to the parent. */ -void __fsnotify_update_child_dentry_flags(struct inode *inode) +void fsnotify_set_children_dentry_flags(struct inode *inode) { struct dentry *alias; - int watched; if (!S_ISDIR(inode->i_mode)) return; - /* determine if the children should tell inode about their events */ - watched = fsnotify_inode_watches_children(inode); - spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ @@ -131,10 +127,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); - if (watched) - child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; - else - child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; + child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); @@ -142,6 +135,24 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) spin_unlock(&inode->i_lock); } +/* + * Lazily clear false positive PARENT_WATCHED flag for child whose parent had + * stopped watching children. + */ +static void fsnotify_clear_child_dentry_flag(struct inode *pinode, + struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + /* + * d_lock is a sufficient barrier to prevent observing a non-watched + * parent state from before the fsnotify_set_children_dentry_flags() + * or fsnotify_update_flags() call that had set PARENT_WATCHED. + */ + if (!fsnotify_inode_watches_children(pinode)) + dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; + spin_unlock(&dentry->d_lock); +} + /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt, __u32 mask) @@ -210,7 +221,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) - __fsnotify_update_child_dentry_flags(p_inode); + fsnotify_clear_child_dentry_flag(p_inode, dentry); /* * Include parent/name in notification either if some notification diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index ff2063ec6b0f3c72f7d1c1a89cc44a46a301f98a..f7da9964909cb3206a48d6ca1afd31a3c90eab26 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -59,7 +59,7 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. */ -extern void __fsnotify_update_child_dentry_flags(struct inode *inode); +extern void fsnotify_set_children_dentry_flags(struct inode *inode); /* allocate and destroy and event holder to attach events to notification/access queues */ extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 5b44be5f93dd8d1e0142196f21a30b1bdb04b3e1..65dae417cabbc0f7625899da948479195fcb107f 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -132,6 +132,24 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) *fsnotify_conn_mask_p(conn) = new_mask; } +static bool fsnotify_conn_watches_children( + struct fsnotify_mark_connector *conn) +{ + if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) + return false; + + return fsnotify_inode_watches_children(fsnotify_conn_inode(conn)); +} + +static void fsnotify_conn_set_children_dentry_flags( + struct fsnotify_mark_connector *conn) +{ + if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) + return; + + fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn)); +} + /* * Calculate mask of events for a list of marks. The caller must make sure * connector and connector->obj cannot disappear under us. Callers achieve @@ -140,15 +158,23 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { + bool update_children; + if (!conn) return; spin_lock(&conn->lock); + update_children = !fsnotify_conn_watches_children(conn); __fsnotify_recalc_mask(conn); + update_children &= fsnotify_conn_watches_children(conn); spin_unlock(&conn->lock); - if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) - __fsnotify_update_child_dentry_flags( - fsnotify_conn_inode(conn)); + /* + * Set children's PARENT_WATCHED flags only if parent started watching. + * When parent stops watching, we clear false positive PARENT_WATCHED + * flags lazily in __fsnotify_parent(). + */ + if (update_children) + fsnotify_conn_set_children_dentry_flags(conn); } /* Free all connectors queued for freeing once SRCU period ends */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index a2e42d3cd87cfaf94bc2884cacc2fa740c3896e6..ffb1bafb044ec03be55d22e8f6ecc6d6dc542a3a 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -437,12 +437,14 @@ static inline __u32 fsnotify_parent_needed_mask(__u32 mask) static inline int fsnotify_inode_watches_children(struct inode *inode) { + __u32 parent_mask = READ_ONCE(inode->i_fsnotify_mask); + /* FS_EVENT_ON_CHILD is set if the inode may care */ - if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)) + if (!(parent_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ - return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD; + return parent_mask & FS_EVENTS_POSS_ON_CHILD; } /* @@ -456,7 +458,7 @@ static inline void fsnotify_update_flags(struct dentry *dentry) /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken - * d_lock, the following __fsnotify_update_child_dentry_flags call will + * d_lock, the following fsnotify_set_children_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 9ed1be47c8cb96bbdc0a9c096fe8f5d224e1ed35..181830659484ebdd2934c4758f80d067804705a6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2792,6 +2792,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 60ff36f5d7f9e73579094a91bd8395163c7882e0..36471bbe593899f35df8b8c47a0db477d013efe8 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -857,6 +857,7 @@ struct uprobe_cpu_buffer { }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; +#define MAX_UCB_BUFFER_SIZE PAGE_SIZE static int uprobe_buffer_init(void) { @@ -957,9 +958,6 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) - return; - if (trace_trigger_soft_disabled(trace_file)) return; @@ -1502,6 +1500,10 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); + + if (WARN_ON_ONCE(tu->tp.size + dsize > MAX_UCB_BUFFER_SIZE)) + dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size; + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) @@ -1537,6 +1539,10 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); + + if (WARN_ON_ONCE(tu->tp.size + dsize > MAX_UCB_BUFFER_SIZE)) + dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size; + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index 7dfa88282b006a408c7a6e77f69c17353b333301..78f081d695d0b79a52a1a49e46f851f3a3686754 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -131,6 +131,8 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) { v = new_root; new_node = NULL; + } else { + new_node->children[0] = NULL; } } diff --git a/mm/memory.c b/mm/memory.c index ecfeda3db9ffb3e1ac10509445e68d79a8ad8992..5bf3bb445fd51e83f0f9951ef52626477fa5b6b2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2295,26 +2295,13 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target page aligned user address to start at - * @pfn: page frame number of kernel physical memory address - * @size: size of mapping area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. - * - * Return: %0 on success, negative error code otherwise. - */ -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; - unsigned long remap_pfn = pfn; int err; if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) @@ -2344,10 +2331,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size)); - if (err) - return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); @@ -2359,12 +2342,57 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) - break; + return err; } while (pgd++, addr = next, addr != end); + return 0; +} + +/* + * Variant of remap_pfn_range that does not call track_pfn_remap. The caller + * must have pre-validated the caching bits of the pgprot_t. + */ +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); + + if (!error) + return 0; + + /* + * A partial pfn range mapping is dangerous: it does not + * maintain page reference counts, and callers may free + * pages due to the error. So zap it early. + */ + zap_page_range_single(vma, addr, size, NULL); + return error; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); if (err) - untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size)); + return -EINVAL; + err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); + if (err) + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); return err; } EXPORT_SYMBOL(remap_pfn_range);