From fe6b26587dad6e2b94c596add0977afd65ca7043 Mon Sep 17 00:00:00 2001 From: WangShuo Date: Fri, 10 Jun 2022 09:21:23 +0800 Subject: [PATCH 1/3] latency_nice Change-Id: I7d7ded0783d6cd24184ba3df3c2441dd6a5698e1 --- include/linux/sched.h | 7 ++++ include/uapi/linux/sched.h | 4 +- include/uapi/linux/sched/types.h | 15 ++++++++ init/Kconfig | 7 ++++ init/init_task.c | 1 + kernel/sched/core.c | 49 ++++++++++++++++++++++++ kernel/sched/debug.c | 1 + kernel/sched/fair.c | 64 ++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 1 + tools/include/uapi/linux/sched.h | 4 +- 10 files changed, 151 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6ae4d7ae5a3b..3c1dc8632ec3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -538,6 +538,10 @@ struct sched_entity { unsigned long runnable_weight; #endif +#ifdef CONFIG_FAIR_LATENCY_NICE + int latency_weight; +#endif + #ifdef CONFIG_SMP /* * Per entity load average tracking. @@ -797,6 +801,9 @@ struct task_struct { int static_prio; int normal_prio; unsigned int rt_priority; +#ifdef CONFIG_FAIR_LATENCY_NICE + bool latency_nice; +#endif const struct sched_class *sched_class; struct sched_entity se; diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..b2e932c25be6 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { #define SCHED_FLAG_KEEP_PARAMS 0x10 #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 +#define SCHED_FLAG_LATENCY_NICE 0x80 #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ SCHED_FLAG_KEEP_PARAMS) @@ -143,6 +144,7 @@ struct clone_args { SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN | \ SCHED_FLAG_KEEP_ALL | \ - SCHED_FLAG_UTIL_CLAMP) + SCHED_FLAG_UTIL_CLAMP | \ + SCHED_FLAG_LATENCY_NICE) #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index c852153ddb0d..18f718c744de 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -96,6 +96,21 @@ struct sched_param { * on a CPU with a capacity big enough to fit the specified value. * A task with a max utilization value smaller than 1024 is more likely * scheduled on a CPU with no more capacity than the specified value. + * + * Latency Tolerance Attributes + * =========================== + * + * A subset of sched_attr attributes allows to specify the relative latency + * requirements of a task with respect to the other tasks running/queued in the + * system. + * + * @ sched_flags SCHED_FLAG_LATENCY_NICE indicates the task's latency_nice. + * + * The latency_nice of a task can have either true or false. + * + * A task with latency_nice with the value of LATENCY_NICE_MIN can be + * taken for a task requiring a lower latency as opposed to the task with + * higher latency_nice. */ struct sched_attr { __u32 size; diff --git a/init/Kconfig b/init/Kconfig index 2e5b9288081e..152e85bbf8c6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -880,6 +880,13 @@ config SCHED_RT_ACTIVE_LB help Check and migrate the RT process to a more suitable CPU in the tick. +config FAIR_LATENCY_NICE + bool "Latency nice for cfs scheduler" + default n + help + Adding a latency nice priority to describe the latency tolerance of + cfs tasks. 
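[Editor's illustration, not part of the patch] A minimal userspace sketch of how the new SCHED_FLAG_LATENCY_NICE bit is meant to be set. It assumes the usual raw sched_setattr() syscall (glibc typically provides no wrapper) and re-declares struct sched_attr to mirror include/uapi/linux/sched/types.h:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define SCHED_FLAG_LATENCY_NICE	0x80	/* added by this patch */

struct sched_attr {			/* mirrors <linux/sched/types.h> */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;
	uint32_t sched_util_max;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;				/* SCHED_NORMAL */
	attr.sched_flags = SCHED_FLAG_LATENCY_NICE;	/* mark as latency sensitive */

	/*
	 * pid 0 == calling thread.  Note this also (re)applies SCHED_NORMAL
	 * with nice 0, since no KEEP flags are passed.
	 */
	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}

Because of the permission check added to __sched_setscheduler() below, an unprivileged caller that is not already marked latency_nice may get -EPERM; once the flag is accepted, sched_getattr() reports SCHED_FLAG_LATENCY_NICE back in sched_flags.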
+ endmenu # diff --git a/init/init_task.c b/init/init_task.c index 5fa18ed59d33..ce30d58a89cc 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -78,6 +78,7 @@ struct task_struct init_task .prio = MAX_PRIO - 20, .static_prio = MAX_PRIO - 20, .normal_prio = MAX_PRIO - 20, + .latency_nice = false, .policy = SCHED_NORMAL, .cpus_ptr = &init_task.cpus_mask, .cpus_mask = CPU_MASK_ALL, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46a0df7d1047..bc70adf89565 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -873,6 +873,11 @@ static void set_load_weight(struct task_struct *p, bool update_load) } } +static void set_latency_weight(struct task_struct *p) +{ + p->se.latency_weight = sched_latency_to_weight[p->static_prio = MAX_RT_PRIO]; +} + #ifdef CONFIG_UCLAMP_TASK /* * Serializes updates of utilization clamp values @@ -3177,6 +3182,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + p->se.latency_weight = 0; INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3348,6 +3354,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ p->prio = current->normal_prio; + /* Propagate the parent's latency requirements to the child as well */ + p->latency_nice = current->latency_nice; + uclamp_fork(p); /* @@ -3370,6 +3379,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); + p->latency_nice = false; /* * We don't need the reset flag anymore after the fork. It has @@ -3378,6 +3388,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } + /* Once latency_nice is set, update the latency weight */ + if (p->latency_nice) + set_latency_weight(p); + if (dl_prio(p->prio)) return -EAGAIN; else if (rt_prio(p->prio)) @@ -5316,6 +5330,15 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } +static void __setscheduler_latency(struct task_struct *p, + const struct sched_attr *attr) +{ + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { + p->latency_nice = true; + set_latency_weight(p); + } +} + /* * Check the target process has a UID that matches the current process's: */ @@ -5422,6 +5445,11 @@ static int __sched_setscheduler(struct task_struct *p, /* Normal users shall not reset the sched_reset_on_fork flag: */ if (p->sched_reset_on_fork && !reset_on_fork) return -EPERM; + + /* Use the same security checks as NICE */ + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && + !p->latency_nice) + return -EPERM; } if (user) { @@ -5474,6 +5502,9 @@ static int __sched_setscheduler(struct task_struct *p, goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE ^ + p->latency_nice) + goto change; p->sched_reset_on_fork = reset_on_fork; retval = 0; @@ -5562,6 +5593,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); } + __setscheduler_latency(p, attr); __setscheduler_uclamp(p, attr); if (queued) { @@ -5999,6 +6031,9 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else kattr.sched_nice = task_nice(p); + if (p->latency_nice) + kattr.sched_flags |= SCHED_FLAG_LATENCY_NICE; + #ifdef CONFIG_UCLAMP_TASK /* * This could race with another potential updater, but this is fine @@ -9047,6 +9082,20 @@ const u32 sched_prio_to_wmult[40] = { /* 
15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; +/* + * latency weight for wakeup preemption + */ +const int sched_latency_to_weight[40] = { + /* -20 */ 1024, 973, 922, 870, 819, + /* -15 */ 768, 717, 666, 614, 563, + /* -10 */ 512, 461, 410, 358, 307, + /* -5 */ 256, 205, 154, 102, 51, + /* 0 */ 0, -51, -102, -154, -205, + /* 5 */ -256, -307, -358, -410, -461, + /* 10 */ -512, -563, -614, -666, -717, + /* 15 */ -768, -819, -870, -922, -973, +}; + void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e5af311230be..4fffe05c8624 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1046,6 +1046,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); + P(latency_nice); if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dafc7d8d9c8f..e90b9db85556 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5579,6 +5579,34 @@ static int sched_idle_cpu(int cpu) } #endif +static void set_next_buddy(struct sched_entity *se); + +static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se) +{ + struct sched_entity *next; + + if (se->latency_weight <= 0) + return; + + if (cfs->nr_running <= 1) + return; + /* + * When waking from idle, we don't need to check to preempt at wakeup + * the idle thread and don't set next buddy as a candidate for being + * picked in priority. + * In case of simultaneous wakeup from idle, the latency sensitive tasks + * lost opportunity to preempt non sensitive tasks which woke up + * simultaneously. + */ + if (cfs->next) + next = cfs->next; + else + next = __pick_first_entity(cfs); + + if (next && wakeup_preempt_entity(next, se) == 1) + set_next_buddy(se); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5668,6 +5696,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) update_overutilized_status(rq); + if (rq->curr == rq->idle) + check_preempt_from_idle(cfs_rq_of(&p->se), &p->se); + enqueue_throttle: if (cfs_bandwidth_used()) { /* @@ -7012,6 +7043,37 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ +static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se) +{ + int latency_weight = se->latency_weight; + long thresh = sysctl_sched_latency; + + /* + * A positive latency weigth means that the sched_entity has latency + * requirement that needs to be evaluated versus other entity. + * Otherwise, use the latency weight to evaluate how much scheduling + * delay is acceptable by se. 
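[Editor's illustration, not part of the patch] A stand-alone sketch of the arithmetic this helper performs. It assumes sysctl_sched_latency of 24 ms, GENTLE_FAIR_SLEEPERS enabled, SCHED_FIXEDPOINT_SHIFT of 10, and takes the clamp bounds from the first and last entries of the sched_latency_to_weight table above:

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10

/* Stand-alone mirror of wakeup_latency_gran(), with assumed constants. */
static long latency_gran(long curr_weight, long se_weight)
{
	long thresh = 24000000L;	/* sysctl_sched_latency, ns (assumed) */
	long weight = se_weight;

	if (se_weight > 0 || curr_weight > 0)
		weight -= curr_weight;
	if (!weight)
		return 0;

	thresh >>= 1;			/* GENTLE_FAIR_SLEEPERS */

	if (weight > 1024)		/* sched_latency_to_weight[0]  */
		weight = 1024;
	if (weight < -973)		/* sched_latency_to_weight[39] */
		weight = -973;

	return (thresh * weight) >> SCHED_FIXEDPOINT_SHIFT;
}

int main(void)
{
	/* Latency-sensitive wakee vs. neutral current task: the wakee's vdiff
	 * in wakeup_preempt_entity() grows by half a scheduling period. */
	printf("%ld\n", latency_gran(0, 1024));		/* 12000000 ns */

	/* The reverse pairing is penalised by roughly the same amount. */
	printf("%ld\n", latency_gran(1024, 0));		/* about -11400000 ns */
	return 0;
}

In wakeup_preempt_entity() below this value is added to vdiff, so a latency-sensitive wakee preempts the running task up to half a scheduling period earlier, while a latency-sensitive current task is correspondingly harder to preempt.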
+ */ + if ((se->latency_weight > 0) || (curr->latency_weight > 0)) + latency_weight -= curr->latency_weight; + + if (!latency_weight) + return 0; + + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; + + /* + * Clamp the delta to stay in the scheduler period range + * [-sysctl_sched_latency:sysctl_sched_latency] + */ + latency_weight = clamp_t(long, latency_weight, + sched_latency_to_weight[NICE_WIDTH], + sched_latency_to_weight[0]); + + return (thresh * latency_weight) >> SCHED_FIXEDPOINT_SHIFT; +} + static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -7051,6 +7113,8 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) { s64 gran, vdiff = curr->vruntime - se->vruntime; + vdiff += wakeup_latency_gran(curr, se); + if (vdiff <= 0) return -1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e4c65d96185e..31f71b7c0433 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1841,6 +1841,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) extern const int sched_prio_to_weight[40]; extern const u32 sched_prio_to_wmult[40]; +extern const int sched_latency_to_weight[40]; /* * {de,en}queue flags: diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h index 3bac0a8ceab2..b2e932c25be6 100644 --- a/tools/include/uapi/linux/sched.h +++ b/tools/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { #define SCHED_FLAG_KEEP_PARAMS 0x10 #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 +#define SCHED_FLAG_LATENCY_NICE 0x80 #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ SCHED_FLAG_KEEP_PARAMS) @@ -143,6 +144,7 @@ struct clone_args { SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN | \ SCHED_FLAG_KEEP_ALL | \ - SCHED_FLAG_UTIL_CLAMP) + SCHED_FLAG_UTIL_CLAMP | \ + SCHED_FLAG_LATENCY_NICE) #endif /* _UAPI_LINUX_SCHED_H */ -- Gitee From 4e5b8c900997988671fe2f4cd5fdbb81d5511dd0 Mon Sep 17 00:00:00 2001 From: chiliren Date: Fri, 24 Jun 2022 12:41:07 +0800 Subject: [PATCH 2/3] umcg --- arch/arm/tools/syscall.tbl | 1 + arch/arm64/Kconfig | 1 + arch/arm64/include/asm/thread_info.h | 11 + arch/arm64/include/asm/unistd.h | 5 +- arch/arm64/include/asm/unistd32.h | 2 + arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/ptrace.c | 6 + arch/arm64/kernel/signal.c | 2 + arch/arm64/kernel/syscall.c | 6 +- arch/arm64/mm/fault.c | 41 ++- fs/exec.c | 1 + include/linux/sched.h | 79 +++++ include/uapi/linux/umcg.h | 59 ++++ init/Kconfig | 14 + kernel/exit.c | 4 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 8 +- kernel/sched/sched.h | 3 + kernel/sched/umcg.c | 446 +++++++++++++++++++++++++++ kernel/sys_ni.c | 3 + mm/migrate.c | 9 +- mm/mprotect.c | 6 + 22 files changed, 702 insertions(+), 8 deletions(-) create mode 100644 include/uapi/linux/umcg.h create mode 100644 kernel/sched/umcg.c diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index d056a548358e..51a7cff4d60e 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -454,3 +454,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +451 common umcg_ctl sys_umcg_ctl diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 09c41d56fe78..ef0b3d46f6c4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -202,6 +202,7 @@ config ARM64 select SWIOTLB select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select HAVE_UMCG help ARM 64-bit (AArch64) Linux 
support. diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 1fbab854a51b..689168c46b9e 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -68,6 +68,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ #define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */ +#define TIF_UMCG 7 /* UMCG return to user hook */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -98,15 +99,25 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) +#define _TIF_UMCG (1 << TIF_UMCG) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT) +#define _TIF_WORK_MASK_UMCG (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ + _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT | _TIF_UMCG) + #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) +#define _TIF_SYSCALL_WORK_UMCG (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ + _TIF_SYSCALL_EMU | _TIF_UMCG) + + #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b3b2019f8d16..b85bd4084492 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -29,6 +29,9 @@ #define __NR_compat_clock_getres 264 #define __NR_compat_clock_gettime64 403 #define __NR_compat_clock_getres_time64 406 +#define __NR_umcg_ctl 450 +#define __NR_umcg_wait 451 +#define __NR_umcg_kick 452 /* * The following SVCs are ARM private. 
@@ -38,7 +41,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 441 +#define __NR_compat_syscalls 453 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 107f08e03b9f..a9c6f02c73e3 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_umcg_ctl 450 +__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index d5bc1dbdd2fd..4e826059e6d0 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -751,7 +751,7 @@ SYM_CODE_START_LOCAL(ret_to_user) bl trace_hardirqs_off #endif ldr x19, [tsk, #TSK_TI_FLAGS] - and x2, x19, #_TIF_WORK_MASK + and x2, x19, #_TIF_WORK_MASK_UMCG cbnz x2, work_pending finish_ret_to_user: user_enter_irqoff diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 2817e39881fe..b93825402a4d 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1810,6 +1810,9 @@ int syscall_trace_enter(struct pt_regs *regs) if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, regs->syscallno); + if (test_thread_flag(TIF_UMCG)) + umcg_sys_enter(regs, regs->syscallno); + audit_syscall_entry(regs->syscallno, regs->orig_x0, regs->regs[1], regs->regs[2], regs->regs[3]); @@ -1828,6 +1831,9 @@ void syscall_trace_exit(struct pt_regs *regs) if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)) tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); + if (flags & _TIF_UMCG) + umcg_sys_exit(regs); + rseq_syscall(regs); } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index e62005317ce2..37b4aa067e23 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -939,6 +939,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, if (thread_flags & _TIF_SIGPENDING) do_signal(regs); + if (thread_flags & _TIF_UMCG) + umcg_notify_resume(regs); if (thread_flags & _TIF_NOTIFY_RESUME) { tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index befde0eaa5e7..d42536d723f8 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -55,7 +55,11 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, static inline bool has_syscall_work(unsigned long flags) { - return unlikely(flags & _TIF_SYSCALL_WORK); + if (current->flags & PF_UMCG_WORKER) { + return unlikely(flags & _TIF_SYSCALL_WORK_UMCG); + } else { + return unlikely(flags & _TIF_SYSCALL_WORK); + } } int syscall_trace_enter(struct pt_regs *regs); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 795d224f184f..6a3e36c27be6 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -448,6 +448,29 @@ static bool is_write_abort(unsigned int esr) return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); } +/** + * irqentry_irq_enable - Conditionally enable IRQs from exceptions + * + * Common code for exceptions to (re)enable IRQs, typically done to allow + * from-user exceptions to schedule (since they run on the task stack). 
+ */ +static inline void irqentry_irq_enable(struct pt_regs *regs) +{ + if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) + umcg_sys_enter(regs, -1); +} + +/** + * irqentry_irq_disable - Conditionally disable IRQs from exceptions + * + * Counterpart of irqentry_irq_enable(). +*/ +static inline void irqentry_irq_disable(struct pt_regs *regs) +{ + if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) + umcg_sys_exit(regs); +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -456,6 +479,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, vm_fault_t fault; unsigned long vm_flags = VM_ACCESS_FLAGS; unsigned int mm_flags = FAULT_FLAG_DEFAULT; + bool umcgFlag = false; if (kprobe_page_fault(regs, esr)) return 0; @@ -470,6 +494,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (user_mode(regs)) mm_flags |= FAULT_FLAG_USER; + irqentry_irq_enable(regs); + umcgFlag = true; + if (is_el0_instruction_abort(esr)) { vm_flags = VM_EXEC; mm_flags |= FAULT_FLAG_INSTRUCTION; @@ -525,6 +552,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) goto no_context; + if (umcgFlag) + irqentry_irq_disable(regs); return 0; } @@ -540,8 +569,11 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, * Handle the "normal" (no error) case first. */ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | - VM_FAULT_BADACCESS)))) + VM_FAULT_BADACCESS)))) { + if (umcgFlag) + irqentry_irq_disable(regs); return 0; + } /* * If we are in kernel mode at this point, we have no context to @@ -557,6 +589,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, * oom-killed). */ pagefault_out_of_memory(); + if (umcgFlag) + irqentry_irq_disable(regs); return 0; } @@ -589,10 +623,15 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, inf->name); } + if (umcgFlag) + irqentry_irq_disable(regs); + return 0; no_context: __do_kernel_fault(addr, esr, regs); + if (umcgFlag) + irqentry_irq_disable(regs); return 0; } diff --git a/fs/exec.c b/fs/exec.c index 72f8763b3ce9..6cbed4e71590 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1831,6 +1831,7 @@ static int bprm_execve(struct linux_binprm *bprm, current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); + umcg_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c1dc8632ec3..e1be61ba2091 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -69,6 +69,7 @@ struct signal_struct; struct task_delay_info; struct task_group; struct io_uring_task; +struct umcg_task; /* * Task state bitmask. NOTE! These bits are also @@ -1312,6 +1313,19 @@ struct task_struct { unsigned long rseq_event_mask; #endif +#ifdef CONFIG_UMCG + /* setup by sys_umcg_ctrl() */ + u32 umcg_flags; + struct umcg_task __user *umcg_task; + + /* setup by umcg_pin_enter() */ + struct page *umcg_page; + + struct task_struct *umcg_server; + struct umcg_task __user *umcg_server_task; + struct page *umcg_server_page; +#endif + struct tlbflush_unmap_batch tlb_ubc; union { @@ -1691,6 +1705,13 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. 
*/ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ + +#ifdef CONFIG_UMCG +#define PF_UMCG_WORKER 0x01000000 /* UMCG worker */ +#else +#define PF_UMCG_WORKER 0x00000000 +#endif + #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ @@ -2202,6 +2223,64 @@ static inline void rseq_execve(struct task_struct *t) #endif +#ifdef CONFIG_UMCG +extern void umcg_sys_enter(struct pt_regs *regs, long syscall); +extern void umcg_sys_exit(struct pt_regs *regs); +extern void umcg_worker_exit(void); +extern void umcg_clear_child(struct task_struct *tsk); +extern void umcg_notify_resume(struct pt_regs *regs); + +/* Called by bprm_execve() in fs/exec.c. */ +static inline void umcg_execve(struct task_struct *tsk) +{ + if (tsk->umcg_task) + umcg_clear_child(tsk); +} + +/* Called by do_exit() in kernel/exit.c */ +static inline void umcg_handle_exit(void) +{ + if (current->flags & PF_UMCG_WORKER) + umcg_worker_exit(); +} + +/* + *umcg_wq_worker_[sleeping|running] are called in core.c by + *sched_submit_work() and sched_update_worker(). + */ +extern void umcg_wq_worker_sleeping(struct task_struct *tsk); + +#else /* CONFIG_UMCG */ + +static inline void umcg_sys_enter(struct pt_regs *regs, long syscall) +{ +} + +static inline void umcg_sys_exit(struct pt_regs *regs) +{ +} + +static inline void umcg_clear_child(struct task_struct *tsk) +{ +} + +static inline void umcg_wq_worker_sleeping(struct task_struct *tsk) +{ +} + +static inline void umcg_execve(struct task_struct *tsk) +{ +} + +static inline void umcg_handle_exit(void) +{ +} +static inline void umcg_notify_resume(struct pt_regs *regs) +{ +} + +#endif + #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); diff --git a/include/uapi/linux/umcg.h b/include/uapi/linux/umcg.h new file mode 100644 index 000000000000..9d6cba42094b --- /dev/null +++ b/include/uapi/linux/umcg.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_UMCG_H +#define _UAPI_LINUX_UMCG_H + +#include + +/* + * UMCG: User Managed Concurrency Groups. + * + * Syscalls (see kernel/sched/umcg.c): + * sys_umcg_ctl() - register/unregister UMCG tasks; + * + * struct umcg_task (below): controls the state of UMCG tasks. + */ + +#define UMCG_TASK_MASK 0x00ffU + +#define UMCG_TASK_ALIGN 64 + +#define UMCG_TID_MASK 0x3fffffffU + +/** + * struct umcg_task - controls the state of UMCG tasks. + * + * The struct is aligned at 64 bytes to ensure that it fits into + * a single cache line. 
+ */ +struct umcg_task { + __u32 server_tid; /* r */ + + /** + * @workers_sum: count the number of workers which is bound with server + * Read-only for the userspace + */ + __u32 workers_sum; /* r */ + + /** + * @blocked_workers_cnt: count the number of blocked workers + * + * Read-only for the userspace + */ + __u32 blocked_workers_cnt; /* r */ + + __u32 __zero[3]; +} __attribute__((packed, aligned(UMCG_TASK_ALIGN))); + +/** + * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl + * @UMCG_CTL_REGISTER: register the current task as a UMCG task + * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task + * @UMCG_CTL_WORKER: register the current task as a UMCG worker + */ +enum umcg_ctl_flag { + UMCG_CTL_REGISTER = 0x00001, + UMCG_CTL_UNREGISTER = 0x00002, + UMCG_CTL_WORKER = 0x10000, +}; + +#endif /* _UAPI_LINUX_UMCG_H */ diff --git a/init/Kconfig b/init/Kconfig index 152e85bbf8c6..09a57c93cafe 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1698,6 +1698,20 @@ config MEMBARRIER If unsure, say Y. +config HAVE_UMCG + bool + +config UMCG + bool "Enable User Managed Concurrency Groups API" + depends on 64BIT + depends on HAVE_UMCG + default n + help + Enable User Managed Concurrency Groups API, which form the basis + for an in-process M:N userspace scheduling framework. + At the moment this is an experimental/RFC feature that is not + guaranteed to be backward-compatible. + config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT default y diff --git a/kernel/exit.c b/kernel/exit.c index 795e16ecc422..36ba2c72d1fb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -728,6 +728,9 @@ void __noreturn do_exit(long code) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + /* Turn off UMCG sched hooks. */ + if (unlikely(tsk->flags & PF_UMCG_WORKER)) + tsk->flags &= ~PF_UMCG_WORKER; /* * If do_exit is called because this processes oopsed, it's possible @@ -766,6 +769,7 @@ void __noreturn do_exit(long code) io_uring_files_cancel(tsk->files); exit_signals(tsk); /* sets PF_EXITING */ sched_exit(tsk); + umcg_handle_exit(); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 1b4834073ae7..865ff3c21000 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -40,3 +40,4 @@ obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o obj-$(CONFIG_SCHED_RUNNING_AVG) += sched_avg.o obj-$(CONFIG_SCHED_CORE_CTRL) += core_ctl.o +obj-$(CONFIG_UMCG) += umcg.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc70adf89565..31a6cc2c5950 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2932,8 +2932,7 @@ static inline void walt_try_to_wake_up(struct task_struct *p) * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; @@ -3219,6 +3218,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHED_RTG p->rtg_depth = 0; #endif + umcg_clear_child(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -4722,10 +4722,12 @@ static inline void sched_submit_work(struct task_struct *tsk) * in the possible wakeup of a kworker and because wq_worker_sleeping() * requires it. 
*/ - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_WQ_WORKER)) { preempt_disable(); if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); + else if (task_flags & PF_UMCG_WORKER) + umcg_wq_worker_sleeping(tsk); else io_wq_worker_sleeping(tsk); preempt_enable_no_resched(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31f71b7c0433..53292a08a737 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1826,6 +1826,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ #define WF_ON_CPU 0x08 /* Wakee is on_cpu */ +#define WF_CURRENT_CPU 0x80 /* Prefer to move the wakee to the current CPU. */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -2794,6 +2795,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags); + #ifdef CONFIG_SCHED_RTG extern bool task_fits_max(struct task_struct *p, int cpu); extern unsigned long capacity_spare_without(int cpu, struct task_struct *p); diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c new file mode 100644 index 000000000000..50aade21560a --- /dev/null +++ b/kernel/sched/umcg.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * User Managed Concurrency Groups (UMCG). + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +static struct task_struct *umcg_get_task(u32 tid) +{ + struct task_struct *tsk = NULL; + + if (tid) { + rcu_read_lock(); + tsk = find_task_by_vpid(tid & UMCG_TID_MASK); + if (tsk && current->mm == tsk->mm && tsk->umcg_task) + get_task_struct(tsk); + else + tsk = NULL; + rcu_read_unlock(); + } + + return tsk; +} + +/* + * Pinning a page inhibits rmap based unmap for Anon pages. Doing a store + * through the user mapping ensures the user mapping exists and is writable. + */ +static int umcg_pin_page(struct umcg_task __user *self, struct page **pagep) +{ + int ret = -EFAULT; + + if (pin_user_pages_fast((unsigned long)self, 1, FOLL_WRITE, pagep) != 1) + goto out; + + if (!PageAnon(*pagep) || + put_user(0ULL, &self->__zero[0])) { + unpin_user_page(*pagep); + goto out; + } + + ret = 0; +out: + return ret; +} + +/** + * umcg_pin_pages: pin pages containing struct umcg_task of + * this task and its server (possibly this task again). 
+ */ +static int umcg_pin_pages(void) +{ + struct task_struct *server = NULL, *tsk = current; + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); + int server_tid; + int ret; + + /* must not have stale state */ + if (WARN_ON_ONCE(tsk->umcg_page || + tsk->umcg_server_page || + tsk->umcg_server_task || + tsk->umcg_server)) + return -EBUSY; + + ret = umcg_pin_page(self, &tsk->umcg_page); + if (ret) + goto clear_self; + + if (get_user(server_tid, &self->server_tid)) + goto unpin_self; + + ret = -ESRCH; + server = umcg_get_task(server_tid); + if (!server) + goto unpin_self; + + /* must cache due to possible concurrent change */ + tsk->umcg_server_task = READ_ONCE(server->umcg_task); + ret = umcg_pin_page(tsk->umcg_server_task, &tsk->umcg_server_page); + if (ret) + goto clear_server; + + tsk->umcg_server = server; + + return 0; + +clear_server: + tsk->umcg_server_task = NULL; + tsk->umcg_server_page = NULL; + +unpin_self: + unpin_user_page(tsk->umcg_page); +clear_self: + tsk->umcg_page = NULL; + + return ret; +} + +static void umcg_unpin_pages(void) +{ + struct task_struct *tsk = current; + + if (tsk->umcg_server) { + unpin_user_page(tsk->umcg_page); + tsk->umcg_page = NULL; + + unpin_user_page(tsk->umcg_server_page); + tsk->umcg_server_page = NULL; + tsk->umcg_server_task = NULL; + + put_task_struct(tsk->umcg_server); + tsk->umcg_server = NULL; + } +} + +static void umcg_clear_task(struct task_struct *tsk) +{ + /* + * This is either called for the current task, or for a newly forked + * task that is not yet running, so we don't need strict atomicity + * below. + */ + if (tsk->umcg_task) { + WRITE_ONCE(tsk->umcg_task, NULL); + + tsk->flags &= ~PF_UMCG_WORKER; + clear_tsk_thread_flag(tsk, TIF_UMCG); + } + + tsk->umcg_page = NULL; + + tsk->umcg_server = NULL; + tsk->umcg_server_page = NULL; + tsk->umcg_server_task = NULL; +} + +/* Called for a forked or execve-ed child. */ +void umcg_clear_child(struct task_struct *tsk) +{ + umcg_clear_task(tsk); +} + +/* Called both by normally (unregister) and abnormally exiting workers. */ +void umcg_worker_exit(void) +{ + umcg_unpin_pages(); + umcg_clear_task(current); +} + +#define __UMCG_DIE(stmt, reason) do { \ + stmt; \ + pr_warn_ratelimited("%s: killing task %s/%d because: " reason "\n",\ + __func__, current->comm, current->pid); \ + force_sig(SIGKILL); \ + return; \ +} while (0) + +#define UMCG_DIE(reason) __UMCG_DIE(,reason) +#define UMCG_DIE_PF(reason) __UMCG_DIE(pagefault_enable(), reason) + +/* Called from syscall enter path and exceptions that can schedule */ +void umcg_sys_enter(struct pt_regs *regs, long syscall) +{ + /* avoid recursion vs our own syscalls */ + if (syscall == __NR_umcg_ctl) + return; + + /* avoid recursion vs schedule() */ + current->flags &= ~PF_UMCG_WORKER; + + /* + * Pin all the state on sys_enter() such that we can rely on it + * from dodgy contexts. It is either unpinned from pre-schedule() + * or sys_exit(), whichever comes first, thereby ensuring the pin + * is temporary. 
+ */ + if (umcg_pin_pages()) + UMCG_DIE("pin"); + + current->flags |= PF_UMCG_WORKER; +} + +static int umcg_upd_workers_sum(struct task_struct *server, bool registed) +{ + struct umcg_task __user *self = server->umcg_task; + u32 sum; + + if (!user_access_begin(self, sizeof(*self))) + return -EFAULT; + + sum = self->workers_sum; + if (registed) + unsafe_put_user(++sum, &self->workers_sum, Efault); + else + unsafe_put_user(--sum, &self->workers_sum, Efault); + + user_access_end(); + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +static int umcg_upd_blk_workers_cnt(struct task_struct *tsk, bool blocked) +{ + struct umcg_task __user *server = tsk->umcg_server_task; + struct umcg_task __user *self = tsk->umcg_task; + u32 sum; + + /* + * umcg_pin_pages() did access_ok() on both pointers, use self here + * only because __user_access_begin() isn't available in generic code. + */ + if (!user_access_begin(self, sizeof(*self))) + return -EFAULT; + + sum = server->blocked_workers_cnt; + if (blocked) + unsafe_put_user(++sum, &server->blocked_workers_cnt, Efault); + else + unsafe_put_user(--sum, &server->blocked_workers_cnt, Efault); + + user_access_end(); + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +/* pre-schedule() */ +void umcg_wq_worker_sleeping(struct task_struct *tsk) +{ + if (!tsk->umcg_server || tsk->state) { + /* + * Either this task blocked before, or SYSCALL_UMCG is + * (temporarily) disabled (see umcg_notify_resume()). Either + * way the pages are unpinned and there's nothing to do. + */ + return; + } + + /* Must not fault, mmap_sem might be held. */ + pagefault_disable(); + + if(umcg_upd_blk_workers_cnt(tsk, true)) + UMCG_DIE_PF("upd_cnt"); + + pagefault_enable(); + + /* + * We're going to sleep, make sure to unpin the pages, this ensures + * the pins are temporary. Also see umcg_sys_exit(). + */ + umcg_unpin_pages(); +} + +/* Called from syscall exit path and exceptions that can schedule */ +void umcg_sys_exit(struct pt_regs *regs) +{ + struct task_struct *tsk = current; + long syscall = syscall_get_nr(tsk, regs); + + if (syscall == __NR_umcg_ctl) + return; + + if (tsk->umcg_server) { + /* + * Didn't block, we done. 
+ */ + umcg_unpin_pages(); + return; + } +} + +/* return-to-user path */ +void umcg_notify_resume(struct pt_regs *regs) +{ + struct task_struct *tsk = current; + bool worker = tsk->flags & PF_UMCG_WORKER; + + if (!worker) + return; + + current->flags &= ~PF_UMCG_WORKER; + + if (umcg_pin_pages()) + UMCG_DIE("pin"); + + umcg_upd_blk_workers_cnt(tsk, false); + + umcg_unpin_pages(); + + current->flags |= PF_UMCG_WORKER; +} + +static int umcg_register(struct umcg_task __user *self, u32 flags) +{ + struct task_struct *server; + struct umcg_task ut; + + current->umcg_flags = flags; + + if (current->umcg_task || !self) + return -EINVAL; + + if (copy_from_user(&ut, self, sizeof(ut))) + return -EFAULT; + + rcu_read_lock(); + server = find_task_by_vpid(ut.server_tid); + if (server && server->mm == current->mm) { + if (flags == UMCG_CTL_WORKER) { + if (!server->umcg_task || (server->flags & PF_UMCG_WORKER)) + server = NULL; + } else { + if (server != current) + server = NULL; + } + } else { + server = NULL; + } + rcu_read_unlock(); + + if (!server) + return -ESRCH; + + if (flags == UMCG_CTL_WORKER) { + umcg_upd_workers_sum(server, true); + WRITE_ONCE(current->umcg_task, self); + current->flags |= PF_UMCG_WORKER; /* hook schedule() */ + set_thread_flag(TIF_UMCG); /* hook return-to-user */ + } else { + self->workers_sum = 0U; + WRITE_ONCE(current->umcg_task, self); + set_thread_flag(TIF_UMCG); /* hook return-to-user */ + } + + return 0; +} + +static int umcg_unregister(struct umcg_task __user *self, u32 flags) +{ + bool worker = current->flags & PF_UMCG_WORKER; + int ret; + + if (!self || self != current->umcg_task) + return -EINVAL; + + if (!worker != !(flags & UMCG_CTL_WORKER)) + return -EINVAL; + + current->flags &= ~PF_UMCG_WORKER; + + ret = umcg_pin_pages(); + if (ret) { + if (worker) + current->flags |= PF_UMCG_WORKER; + return ret; + } + + ret = umcg_upd_workers_sum(current->umcg_server, false); + if (ret) + return ret; + + umcg_unpin_pages(); + umcg_clear_task(current); + return 0; +} + +#define UMCG_CTL_CMD 0xff + +/** + * sys_umcg_ctl: (un)register the current task as a UMCG task. + * @flags: ORed values from enum umcg_ctl_flag; see below; + * @self: a pointer to struct umcg_task that describes this + * task. + * + * @flags & UMCG_CTL_REGISTER: register a UMCG task: + * + * UMCG workers: + * - @flags & UMCG_CTL_WORKER + * + * UMCG servers: + * - !(@flags & UMCG_CTL_WORKER) + * + * All tasks: + * - self->server_tid must be a valid server + * + * If the conditions above are met, sys_umcg_ctl() immediately returns + * if the registered task is a server. If the registered task is a + * worker, it's server's workers_sum will be added. + * + * @flags & UMCG_CTL_UNREGISTER: unregister a UMCG task. 
+ * + * UMCG workers: + * - @flags & UMCG_CTL_WORKER + * + * UMCG servers: + * - !(@flags & UMCG_CTL_WORKER) + * + * All tasks: + * - self must match with UMCG_CTL_REGISTER + * + * Return: + * 0 - success + * -EFAULT - failed to read @self + * -EINVAL - some other error occurred + * -ESRCH - no such server_tid + */ +SYSCALL_DEFINE2(umcg_ctl, u32, flags, struct umcg_task __user *, self) +{ + int cmd = flags & UMCG_CTL_CMD; + + if ((unsigned long)self % UMCG_TASK_ALIGN) + return -EINVAL; + + flags &= ~UMCG_CTL_CMD; + + if (flags & ~UMCG_CTL_WORKER) + return -EINVAL; + + switch (cmd) { + case UMCG_CTL_REGISTER: + return umcg_register(self, flags); + + case UMCG_CTL_UNREGISTER: + return umcg_unregister(self, flags); + + default: + break; + } + + return -EINVAL; +} diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f27ac94d5fa7..5dbf0f9aabc4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -264,6 +264,9 @@ COND_SYSCALL(request_key); COND_SYSCALL(keyctl); COND_SYSCALL_COMPAT(keyctl); +/* kernel/sched/umcg.c */ +COND_SYSCALL(umcg_ctl); + /* arch/example/kernel/sys_example.c */ /* mm/fadvise.c */ diff --git a/mm/migrate.c b/mm/migrate.c index 278e6f3fa62c..2dfaf0f832e9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1449,7 +1449,14 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, nr_subpages = thp_nr_pages(page); cond_resched(); - if (PageHuge(page)) + /* + * If the page has a pin then expected_page_refs() will + * not match and the whole migration will fail later + * anyway, fail early and preserve the mappings. + */ + if (page_maybe_dma_pinned(page)) + rc = -EAGAIN; + else if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, pass > 2, mode, reason); diff --git a/mm/mprotect.c b/mm/mprotect.c index 53b6b1b8fb67..d6934ccc4165 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,6 +105,12 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (page_is_file_lru(page) && PageDirty(page)) continue; + /* + * Can't migarate pinned pages, avoid touching them. + */ + if (page_maybe_dma_pinned(page)) + continue; + /* * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. -- Gitee From bb488204e0cd61cbc3f2515d4033967fb450fe38 Mon Sep 17 00:00:00 2001 From: chiliren Date: Fri, 24 Jun 2022 12:45:22 +0800 Subject: [PATCH 3/3] del wait & kick --- arch/arm64/include/asm/unistd.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b85bd4084492..70644d0b6168 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -30,8 +30,6 @@ #define __NR_compat_clock_gettime64 403 #define __NR_compat_clock_getres_time64 406 #define __NR_umcg_ctl 450 -#define __NR_umcg_wait 451 -#define __NR_umcg_kick 452 /* * The following SVCs are ARM private. -- Gitee
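[Editor's illustration, not part of the series] A minimal userspace sketch of the register/unregister flow that remains after patch 3 removes umcg_wait/umcg_kick. It assumes a kernel built with CONFIG_UMCG, the arm64 syscall number 450 wired up in patch 2, and mirrors the uapi definitions above; compile with -lpthread:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>

#define __NR_umcg_ctl		450	/* from this series (arm64) */

#define UMCG_CTL_REGISTER	0x00001
#define UMCG_CTL_UNREGISTER	0x00002
#define UMCG_CTL_WORKER		0x10000

struct umcg_task {			/* mirrors the uapi header in patch 2 */
	uint32_t server_tid;
	uint32_t workers_sum;
	uint32_t blocked_workers_cnt;
	uint32_t __zero[3];
} __attribute__((packed, aligned(64)));

/* One umcg_task per thread; must be 64-byte aligned and writable. */
static struct umcg_task server_ut, worker_ut;
static pid_t server_tid;

static long umcg_ctl(uint32_t flags, struct umcg_task *self)
{
	return syscall(__NR_umcg_ctl, flags, self);
}

static void *worker_fn(void *arg)
{
	/* A worker names an already-registered server via server_tid. */
	worker_ut.server_tid = server_tid;
	if (umcg_ctl(UMCG_CTL_REGISTER | UMCG_CTL_WORKER, &worker_ut))
		perror("register worker");

	/*
	 * ... do work here; while registered, the worker's blocking in the
	 * kernel is reflected in the server's blocked_workers_cnt via the
	 * pre-schedule and return-to-user hooks added in patch 2 ...
	 */

	if (umcg_ctl(UMCG_CTL_UNREGISTER | UMCG_CTL_WORKER, &worker_ut))
		perror("unregister worker");
	return NULL;
}

int main(void)
{
	pthread_t worker;

	/* A server registers itself: server_tid is its own tid. */
	server_tid = syscall(SYS_gettid);
	server_ut.server_tid = server_tid;
	if (umcg_ctl(UMCG_CTL_REGISTER, &server_ut))
		perror("register server");

	pthread_create(&worker, NULL, worker_fn, NULL);
	pthread_join(worker, NULL);

	return umcg_ctl(UMCG_CTL_UNREGISTER, &server_ut) ? 1 : 0;
}

A server can only manage tasks in its own process (the kernel checks that the looked-up server shares current->mm), and workers_sum / blocked_workers_cnt give the userspace scheduler a cheap, read-only view of how many of its workers exist and how many are currently blocked in the kernel.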