diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 199b95c338a25933c05cff57f306f1ecd515b519..444f90f016874bdc1f2e6e8db94431d0b794de34 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include <linux/completion.h>
 #include <linux/cpumask.h>
 #include <linux/uprobes.h>
+#include <linux/rcupdate.h>
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
@@ -585,6 +586,9 @@ struct mm_struct {
 		bool tlb_flush_batched;
 #endif
 		struct uprobes_state uprobes_state;
+#ifdef CONFIG_PREEMPT_RT
+		struct rcu_head delayed_drop;
+#endif
 #ifdef CONFIG_HUGETLB_PAGE
 		atomic_long_t hugetlb_usage;
 #endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 81da17cfcf71436d27bd6ebe668beb87a0d0eed2..1effcae06ea1ba4302e2031f2acf92eb98d0a70d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -54,6 +54,11 @@ void __rcu_read_unlock(void);
  * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
  */
#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
+#ifndef CONFIG_PREEMPT_RT
+#define sched_rcu_preempt_depth() rcu_preempt_depth()
+#else
+static inline int sched_rcu_preempt_depth(void) { return 0; }
+#endif
 
 #else /* #ifdef CONFIG_PREEMPT_RCU */
@@ -79,6 +84,8 @@ static inline int rcu_preempt_depth(void)
 	return 0;
 }
 
+#define sched_rcu_preempt_depth() rcu_preempt_depth()
+
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /* Internal to kernel */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 5f346ed199e22f39b99b410f60b86496278de62f..c910bcd38d87634d2713783745edb6d25bb2a2af 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -50,6 +50,17 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+extern void __mmdrop_delayed(struct rcu_head *rhp);
+static inline void mmdrop_delayed(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		call_rcu(&mm->delayed_drop, __mmdrop_delayed);
+}
+#else
+# define mmdrop_delayed(mm)	mmdrop(mm)
+#endif
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 04f44e0aa2e0bda1a0fa3c665d3511c0d4c8d531..cae66815f9e209b43c47b04bd07a7d3cac5724d2 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -239,6 +239,9 @@ static inline int get_boot_cpu_id(void)
 #define get_cpu()		({ preempt_disable(); __smp_processor_id(); })
 #define put_cpu()		preempt_enable()
 
+#define get_cpu_light()		({ migrate_disable(); __smp_processor_id(); })
+#define put_cpu_light()		migrate_enable()
+
 /*
  * Callback to arch code if there's nosmp or maxcpus=0 on the
  * boot command line:
diff --git a/kernel/fork.c b/kernel/fork.c
index a8dc422df0d1bff07ff3134c26dfbeeb88ef43f0..ab5865ddbfe2e5b53b7710b4b3f66e4511a00ec2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/kprobes.h>
 #include <linux/vmacache.h>
 #include <linux/nsproxy.h>
 #include <linux/capability.h>
@@ -295,7 +296,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
 			return;
 		}
 
-		vfree_atomic(tsk->stack);
+		vfree(tsk->stack);
 		return;
 	}
 #endif
@@ -729,6 +730,19 @@ void __mmdrop(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * RCU callback for delayed mm drop. Not strictly rcu, but we don't
+ * want another facility to make this work.
+ */
+void __mmdrop_delayed(struct rcu_head *rhp)
+{
+	struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
+
+	__mmdrop(mm);
+}
+#endif
+
 static void mmdrop_async_fn(struct work_struct *work)
 {
 	struct mm_struct *mm;
@@ -770,6 +784,15 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(refcount_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	/*
+	 * Remove function-return probe instances associated with this
+	 * task and put them back on the free list.
+	 */
+	kprobe_flush_task(tsk);
+
+	/* Task is done with its stack. */
+	put_task_stack(tsk);
+
 	io_uring_free(tsk);
 	cgroup_free(tsk);
 	task_numa_free(tsk, true);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8f1accff51d43f244b31a55558779c2124fe81b4..2d8ff04cf53199451e268ccbe43932b07303b251 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -65,7 +65,11 @@ const_debug unsigned int sysctl_sched_features =
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
+#ifdef CONFIG_PREEMPT_RT
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#else
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
@@ -4311,23 +4315,18 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 *   provided by mmdrop(),
 	 * - a sync_core for SYNC_CORE.
 	 */
+	/*
+	 * We use mmdrop_delayed() here so we don't have to do the
+	 * full __mmdrop() when we are the last user.
+	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop(mm);
+		mmdrop_delayed(mm);
 	}
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 
-		/*
-		 * Remove function-return probe instances associated with this
-		 * task and put them back on the free list.
-		 */
-		kprobe_flush_task(prev);
-
-		/* Task is done with its stack. */
-		put_task_stack(prev);
-
 		put_task_struct_rcu_user(prev);
 	}
@@ -7319,6 +7318,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
+
 /*
  * Ensure that the idle task is using init_mm right before its CPU goes
  * offline.
@@ -7942,7 +7942,7 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = preempt_count() + rcu_preempt_depth();
+	int nested = preempt_count() + sched_rcu_preempt_depth();
 
 	return (nested == preempt_offset);
 }
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index b181427c6336d2939ad58dc7ee39b45452726d38..34980a663de9a2d1b5c3ce146e058ce4b89152c8 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -45,11 +45,17 @@ SCHED_FEAT(DOUBLE_TICK, false)
  */
 SCHED_FEAT(NONTASK_CAPACITY, true)
 
+#ifdef CONFIG_PREEMPT_RT
+SCHED_FEAT(TTWU_QUEUE, false)
+#else
+
 /*
  * Queue remote wakeups on the target CPU and process them
  * using the scheduler IPI. Reduces rq->lock contention/bounces.
  */
 SCHED_FEAT(TTWU_QUEUE, true)
+#endif
+
 SCHED_FEAT(PREFER_TTWU_QUEUE, true)
 
 /*