From b1e847da034513cd022b2d8513b449c811e9f95e Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Wed, 7 Jun 2023 18:57:48 +0800
Subject: [PATCH 1/6] sched: Limit the number of task migrations per batch

commit 23f0753cd22301ca69ab3033a6d885f5212353fb upstream.

Put an upper limit on the number of tasks which are migrated per batch
to avoid large latencies.

Signed-off-by: Thomas Gleixner
---
 kernel/sched/core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8f1accff51d4..d5d30c674d3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -65,7 +65,11 @@ const_debug unsigned int sysctl_sched_features =
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
+#ifdef CONFIG_PREEMPT_RT
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#else
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
--
Gitee

From 2d828b50b6fe312ee38a1c005a56764e2a82dc35 Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Wed, 7 Jun 2023 18:57:59 +0800
Subject: [PATCH 2/6] sched: Move mmdrop to RCU on RT

commit 232294444a772a7ddaa170533912757eda859652 upstream.

Takes sleeping locks and calls into the memory allocator, so it is
nothing we want to do in task switch and other atomic contexts.

Signed-off-by: Thomas Gleixner
---
 include/linux/mm_types.h |  4 ++++
 include/linux/sched/mm.h | 11 +++++++++++
 kernel/fork.c            | 13 +++++++++++++
 kernel/sched/core.c      |  7 ++++++-
 4 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 199b95c338a2..444f90f01687 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -585,6 +586,9 @@ struct mm_struct {
 		bool tlb_flush_batched;
 #endif
 		struct uprobes_state uprobes_state;
+#ifdef CONFIG_PREEMPT_RT
+		struct rcu_head delayed_drop;
+#endif
 #ifdef CONFIG_HUGETLB_PAGE
 		atomic_long_t hugetlb_usage;
 #endif
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 5f346ed199e2..c910bcd38d87 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -50,6 +50,17 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+extern void __mmdrop_delayed(struct rcu_head *rhp);
+static inline void mmdrop_delayed(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		call_rcu(&mm->delayed_drop, __mmdrop_delayed);
+}
+#else
+# define mmdrop_delayed(mm)	mmdrop(mm)
+#endif
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
diff --git a/kernel/fork.c b/kernel/fork.c
index a8dc422df0d1..4699c27f51be 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -729,6 +729,19 @@ void __mmdrop(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * RCU callback for delayed mm drop. Not strictly rcu, but we don't
+ * want another facility to make this work.
+ */
+void __mmdrop_delayed(struct rcu_head *rhp)
+{
+	struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
+
+	__mmdrop(mm);
+}
+#endif
+
 static void mmdrop_async_fn(struct work_struct *work)
 {
 	struct mm_struct *mm;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5d30c674d3b..c48e5511bdde 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4315,9 +4315,13 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 *   provided by mmdrop(),
 	 * - a sync_core for SYNC_CORE.
 	 */
+	/*
+	 * We use mmdrop_delayed() here so we don't have to do the
+	 * full __mmdrop() when we are the last user.
+	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop(mm);
+		mmdrop_delayed(mm);
 	}
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
@@ -7323,6 +7327,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
+
 /*
  * Ensure that the idle task is using init_mm right before its CPU goes
  * offline.
--
Gitee

From 158555fa1dccefc2f1c09a542896a38243af1733 Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Wed, 7 Jun 2023 18:58:12 +0800
Subject: [PATCH 3/6] kernel/sched: move stack + kprobe clean up to __put_task_struct()

commit 7a9ca05e850f80923221aee31ad6f8c8cb537fa0 upstream.

There is no need to free the stack before the task struct (except for
reasons mentioned in commit 68f24b08ee89 ("sched/core: Free the stack early
if CONFIG_THREAD_INFO_IN_TASK")). This also comes in handy on -RT because we
can't free memory in a preempt-disabled region. vfree_atomic() delays the
memory cleanup to a worker. Since we move everything to the RCU callback, we
can also free it immediately.

Cc: stable-rt@vger.kernel.org #for kprobe_flush_task()
Signed-off-by: Sebastian Andrzej Siewior
---
 kernel/fork.c       | 12 +++++++++++-
 kernel/sched/core.c |  9 ---------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 4699c27f51be..ab5865ddbfe2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -295,7 +296,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
 			return;
 		}
 
-		vfree_atomic(tsk->stack);
+		vfree(tsk->stack);
 		return;
 	}
 #endif
@@ -783,6 +784,15 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(refcount_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	/*
+	 * Remove function-return probe instances associated with this
+	 * task and put them back on the free list.
+	 */
+	kprobe_flush_task(tsk);
+
+	/* Task is done with its stack. */
+	put_task_stack(tsk);
+
 	io_uring_free(tsk);
 	cgroup_free(tsk);
 	task_numa_free(tsk, true);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c48e5511bdde..81bb1832da3a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4327,15 +4327,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 
-		/*
-		 * Remove function-return probe instances associated with this
-		 * task and put them back on the free list.
-		 */
-		kprobe_flush_task(prev);
-
-		/* Task is done with its stack. */
-		put_task_stack(prev);
-
 		put_task_struct_rcu_user(prev);
 	}
 
--
Gitee

From 2e87a74173031484f5ef026ac7ceb706a00cd084 Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Wed, 7 Jun 2023 18:59:07 +0800
Subject: [PATCH 4/6] sched: Do not account rcu_preempt_depth on RT in might_sleep()

commit 84f5415f200c8b38a012b3875d75bcddddc4c0f3 upstream.

RT changes the rcu_preempt_depth semantics, so we cannot check for it
in might_sleep().

Signed-off-by: Thomas Gleixner
---
 include/linux/rcupdate.h | 7 +++++++
 kernel/sched/core.c      | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 81da17cfcf71..1effcae06ea1 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -54,6 +54,11 @@ void __rcu_read_unlock(void);
  * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
  */
 #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
+#ifndef CONFIG_PREEMPT_RT
+#define sched_rcu_preempt_depth()	rcu_preempt_depth()
+#else
+static inline int sched_rcu_preempt_depth(void) { return 0; }
+#endif
 
 #else /* #ifdef CONFIG_PREEMPT_RCU */
 
@@ -79,6 +84,8 @@ static inline int rcu_preempt_depth(void)
 	return 0;
 }
 
+#define sched_rcu_preempt_depth()	rcu_preempt_depth()
+
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /* Internal to kernel */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 81bb1832da3a..2d8ff04cf531 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7942,7 +7942,7 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = preempt_count() + rcu_preempt_depth();
+	int nested = preempt_count() + sched_rcu_preempt_depth();
 
 	return (nested == preempt_offset);
 }
--
Gitee

From 4f1b301174ea98082c5aafb860c801f40a1a6244 Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Wed, 7 Jun 2023 19:02:38 +0800
Subject: [PATCH 5/6] sched: Disable TTWU_QUEUE on RT

commit c4bd3e2165462311528c75c74a9f1d8c4d809040 upstream.

The queued remote wakeup mechanism can introduce rather large latencies
if the number of migrated tasks is high. Disable it for RT.

Signed-off-by: Thomas Gleixner
---
 kernel/sched/features.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index b181427c6336..34980a663de9 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -45,11 +45,17 @@ SCHED_FEAT(DOUBLE_TICK, false)
  */
 SCHED_FEAT(NONTASK_CAPACITY, true)
 
+#ifdef CONFIG_PREEMPT_RT
+SCHED_FEAT(TTWU_QUEUE, false)
+#else
+
 /*
  * Queue remote wakeups on the target CPU and process them
  * using the scheduler IPI. Reduces rq->lock contention/bounces.
  */
 SCHED_FEAT(TTWU_QUEUE, true)
+#endif
+
 SCHED_FEAT(PREFER_TTWU_QUEUE, true)
 
 /*
--
Gitee

From 7e2bd45c3d2a100ec271459972df5e6901c379ce Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Wed, 7 Jun 2023 19:03:51 +0800
Subject: [PATCH 6/6] kernel/sched: add {put|get}_cpu_light()

commit 0ba972dae09a2ce91cfc35c984c56b6173a7f2f6 upstream.

Signed-off-by: Sebastian Andrzej Siewior
---
 include/linux/smp.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 04f44e0aa2e0..cae66815f9e2 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -239,6 +239,9 @@ static inline int get_boot_cpu_id(void)
 #define get_cpu()		({ preempt_disable(); __smp_processor_id(); })
 #define put_cpu()		preempt_enable()
 
+#define get_cpu_light()		({ migrate_disable(); __smp_processor_id(); })
+#define put_cpu_light()		migrate_enable()
+
 /*
  * Callback to arch code if there's nosmp or maxcpus=0 on the
  * boot command line:
--
Gitee
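
PATCH 6/6 adds get_cpu_light()/put_cpu_light() without introducing a user in
this series. As a rough sketch of the usage pattern the pair is meant for
(all identifiers below - demo_lock, demo_stats, demo_account - are made up
for illustration and are not part of the patches), the idea is: the section
only needs a stable CPU id to reach per-CPU data, and on PREEMPT_RT that data
is protected by a per-CPU spinlock, which is a sleeping lock, so disabling
migration is sufficient and disabling preemption via get_cpu() would not be
allowed around it.

    #include <linux/percpu.h>
    #include <linux/smp.h>
    #include <linux/spinlock.h>

    struct demo_stats {
    	unsigned long events;
    };

    /* Per-CPU data plus a per-CPU lock; spin_lock_init() at boot is omitted. */
    static DEFINE_PER_CPU(spinlock_t, demo_lock);
    static DEFINE_PER_CPU(struct demo_stats, demo_stats);

    static void demo_account(void)
    {
    	/* Pin the task to this CPU without disabling preemption. */
    	int cpu = get_cpu_light();

    	/* On PREEMPT_RT this is a sleeping lock, so preemption must stay on. */
    	spin_lock(&per_cpu(demo_lock, cpu));
    	per_cpu(demo_stats, cpu).events++;
    	spin_unlock(&per_cpu(demo_lock, cpu));

    	put_cpu_light();
    }

Compared with get_cpu()/put_cpu(), only migration is disabled, so the section
keeps a stable CPU id but can still be preempted or block on the per-CPU lock,
which is what RT needs in paths that previously ran with preemption disabled.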