From fe6b26587dad6e2b94c596add0977afd65ca7043 Mon Sep 17 00:00:00 2001 From: WangShuo Date: Fri, 10 Jun 2022 09:21:23 +0800 Subject: [PATCH 1/3] latency_nice Change-Id: I7d7ded0783d6cd24184ba3df3c2441dd6a5698e1 --- include/linux/sched.h | 7 ++++ include/uapi/linux/sched.h | 4 +- include/uapi/linux/sched/types.h | 15 ++++++++ init/Kconfig | 7 ++++ init/init_task.c | 1 + kernel/sched/core.c | 49 ++++++++++++++++++++++++ kernel/sched/debug.c | 1 + kernel/sched/fair.c | 64 ++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 1 + tools/include/uapi/linux/sched.h | 4 +- 10 files changed, 151 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6ae4d7ae5a3b..3c1dc8632ec3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -538,6 +538,10 @@ struct sched_entity { unsigned long runnable_weight; #endif +#ifdef CONFIG_FAIR_LATENCY_NICE + int latency_weight; +#endif + #ifdef CONFIG_SMP /* * Per entity load average tracking. @@ -797,6 +801,9 @@ struct task_struct { int static_prio; int normal_prio; unsigned int rt_priority; +#ifdef CONFIG_FAIR_LATENCY_NICE + bool latency_nice; +#endif const struct sched_class *sched_class; struct sched_entity se; diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..b2e932c25be6 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { #define SCHED_FLAG_KEEP_PARAMS 0x10 #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 +#define SCHED_FLAG_LATENCY_NICE 0x80 #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ SCHED_FLAG_KEEP_PARAMS) @@ -143,6 +144,7 @@ struct clone_args { SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN | \ SCHED_FLAG_KEEP_ALL | \ - SCHED_FLAG_UTIL_CLAMP) + SCHED_FLAG_UTIL_CLAMP | \ + SCHED_FLAG_LATENCY_NICE) #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index c852153ddb0d..18f718c744de 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -96,6 +96,21 @@ struct sched_param { * on a CPU with a capacity big enough to fit the specified value. * A task with a max utilization value smaller than 1024 is more likely * scheduled on a CPU with no more capacity than the specified value. + * + * Latency Tolerance Attributes + * =========================== + * + * A subset of sched_attr attributes allows to specify the relative latency + * requirements of a task with respect to the other tasks running/queued in the + * system. + * + * @ sched_flags SCHED_FLAG_LATENCY_NICE indicates the task's latency_nice. + * + * The latency_nice of a task can have either true or false. + * + * A task with latency_nice with the value of LATENCY_NICE_MIN can be + * taken for a task requiring a lower latency as opposed to the task with + * higher latency_nice. */ struct sched_attr { __u32 size; diff --git a/init/Kconfig b/init/Kconfig index 2e5b9288081e..152e85bbf8c6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -880,6 +880,13 @@ config SCHED_RT_ACTIVE_LB help Check and migrate the RT process to a more suitable CPU in the tick. +config FAIR_LATENCY_NICE + bool "Latency nice for cfs scheduler" + default n + help + Adding a latency nice priority to describe the latency tolerance of + cfs tasks. 
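[Editor's illustration, not part of the patch] A minimal userspace sketch of how the new SCHED_FLAG_LATENCY_NICE bit is meant to be set. It assumes the usual raw sched_setattr() syscall (glibc typically provides no wrapper) and re-declares struct sched_attr to mirror include/uapi/linux/sched/types.h:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define SCHED_FLAG_LATENCY_NICE	0x80	/* added by this patch */

struct sched_attr {			/* mirrors <linux/sched/types.h> */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;
	uint32_t sched_util_max;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;				/* SCHED_NORMAL */
	attr.sched_flags = SCHED_FLAG_LATENCY_NICE;	/* mark as latency sensitive */

	/*
	 * pid 0 == calling thread.  Note this also (re)applies SCHED_NORMAL
	 * with nice 0, since no KEEP flags are passed.
	 */
	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}

Because of the permission check added to __sched_setscheduler() below, an unprivileged caller that is not already marked latency_nice may get -EPERM; once the flag is accepted, sched_getattr() reports SCHED_FLAG_LATENCY_NICE back in sched_flags.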
+ endmenu # diff --git a/init/init_task.c b/init/init_task.c index 5fa18ed59d33..ce30d58a89cc 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -78,6 +78,7 @@ struct task_struct init_task .prio = MAX_PRIO - 20, .static_prio = MAX_PRIO - 20, .normal_prio = MAX_PRIO - 20, + .latency_nice = false, .policy = SCHED_NORMAL, .cpus_ptr = &init_task.cpus_mask, .cpus_mask = CPU_MASK_ALL, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46a0df7d1047..bc70adf89565 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -873,6 +873,11 @@ static void set_load_weight(struct task_struct *p, bool update_load) } } +static void set_latency_weight(struct task_struct *p) +{ + p->se.latency_weight = sched_latency_to_weight[p->static_prio = MAX_RT_PRIO]; +} + #ifdef CONFIG_UCLAMP_TASK /* * Serializes updates of utilization clamp values @@ -3177,6 +3182,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + p->se.latency_weight = 0; INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3348,6 +3354,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ p->prio = current->normal_prio; + /* Propagate the parent's latency requirements to the child as well */ + p->latency_nice = current->latency_nice; + uclamp_fork(p); /* @@ -3370,6 +3379,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); + p->latency_nice = false; /* * We don't need the reset flag anymore after the fork. It has @@ -3378,6 +3388,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } + /* Once latency_nice is set, update the latency weight */ + if (p->latency_nice) + set_latency_weight(p); + if (dl_prio(p->prio)) return -EAGAIN; else if (rt_prio(p->prio)) @@ -5316,6 +5330,15 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } +static void __setscheduler_latency(struct task_struct *p, + const struct sched_attr *attr) +{ + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { + p->latency_nice = true; + set_latency_weight(p); + } +} + /* * Check the target process has a UID that matches the current process's: */ @@ -5422,6 +5445,11 @@ static int __sched_setscheduler(struct task_struct *p, /* Normal users shall not reset the sched_reset_on_fork flag: */ if (p->sched_reset_on_fork && !reset_on_fork) return -EPERM; + + /* Use the same security checks as NICE */ + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && + !p->latency_nice) + return -EPERM; } if (user) { @@ -5474,6 +5502,9 @@ static int __sched_setscheduler(struct task_struct *p, goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE ^ + p->latency_nice) + goto change; p->sched_reset_on_fork = reset_on_fork; retval = 0; @@ -5562,6 +5593,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); } + __setscheduler_latency(p, attr); __setscheduler_uclamp(p, attr); if (queued) { @@ -5999,6 +6031,9 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else kattr.sched_nice = task_nice(p); + if (p->latency_nice) + kattr.sched_flags |= SCHED_FLAG_LATENCY_NICE; + #ifdef CONFIG_UCLAMP_TASK /* * This could race with another potential updater, but this is fine @@ -9047,6 +9082,20 @@ const u32 sched_prio_to_wmult[40] = { /* 
15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; +/* + * latency weight for wakeup preemption + */ +const int sched_latency_to_weight[40] = { + /* -20 */ 1024, 973, 922, 870, 819, + /* -15 */ 768, 717, 666, 614, 563, + /* -10 */ 512, 461, 410, 358, 307, + /* -5 */ 256, 205, 154, 102, 51, + /* 0 */ 0, -51, -102, -154, -205, + /* 5 */ -256, -307, -358, -410, -461, + /* 10 */ -512, -563, -614, -666, -717, + /* 15 */ -768, -819, -870, -922, -973, +}; + void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e5af311230be..4fffe05c8624 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1046,6 +1046,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); + P(latency_nice); if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dafc7d8d9c8f..e90b9db85556 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5579,6 +5579,34 @@ static int sched_idle_cpu(int cpu) } #endif +static void set_next_buddy(struct sched_entity *se); + +static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se) +{ + struct sched_entity *next; + + if (se->latency_weight <= 0) + return; + + if (cfs->nr_running <= 1) + return; + /* + * When waking from idle, we don't need to check to preempt at wakeup + * the idle thread and don't set next buddy as a candidate for being + * picked in priority. + * In case of simultaneous wakeup from idle, the latency sensitive tasks + * lost opportunity to preempt non sensitive tasks which woke up + * simultaneously. + */ + if (cfs->next) + next = cfs->next; + else + next = __pick_first_entity(cfs); + + if (next && wakeup_preempt_entity(next, se) == 1) + set_next_buddy(se); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5668,6 +5696,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) update_overutilized_status(rq); + if (rq->curr == rq->idle) + check_preempt_from_idle(cfs_rq_of(&p->se), &p->se); + enqueue_throttle: if (cfs_bandwidth_used()) { /* @@ -7012,6 +7043,37 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ +static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se) +{ + int latency_weight = se->latency_weight; + long thresh = sysctl_sched_latency; + + /* + * A positive latency weigth means that the sched_entity has latency + * requirement that needs to be evaluated versus other entity. + * Otherwise, use the latency weight to evaluate how much scheduling + * delay is acceptable by se. 
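[Editor's illustration, not part of the patch] A stand-alone sketch of the arithmetic this helper performs. It assumes sysctl_sched_latency of 24 ms, GENTLE_FAIR_SLEEPERS enabled, SCHED_FIXEDPOINT_SHIFT of 10, and takes the clamp bounds from the first and last entries of the sched_latency_to_weight table above:

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10

/* Stand-alone mirror of wakeup_latency_gran(), with assumed constants. */
static long latency_gran(long curr_weight, long se_weight)
{
	long thresh = 24000000L;	/* sysctl_sched_latency, ns (assumed) */
	long weight = se_weight;

	if (se_weight > 0 || curr_weight > 0)
		weight -= curr_weight;
	if (!weight)
		return 0;

	thresh >>= 1;			/* GENTLE_FAIR_SLEEPERS */

	if (weight > 1024)		/* sched_latency_to_weight[0]  */
		weight = 1024;
	if (weight < -973)		/* sched_latency_to_weight[39] */
		weight = -973;

	return (thresh * weight) >> SCHED_FIXEDPOINT_SHIFT;
}

int main(void)
{
	/* Latency-sensitive wakee vs. neutral current task: the wakee's vdiff
	 * in wakeup_preempt_entity() grows by half a scheduling period. */
	printf("%ld\n", latency_gran(0, 1024));		/* 12000000 ns */

	/* The reverse pairing is penalised by roughly the same amount. */
	printf("%ld\n", latency_gran(1024, 0));		/* about -11400000 ns */
	return 0;
}

In wakeup_preempt_entity() below this value is added to vdiff, so a latency-sensitive wakee preempts the running task up to half a scheduling period earlier, while a latency-sensitive current task is correspondingly harder to preempt.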
+ */ + if ((se->latency_weight > 0) || (curr->latency_weight > 0)) + latency_weight -= curr->latency_weight; + + if (!latency_weight) + return 0; + + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; + + /* + * Clamp the delta to stay in the scheduler period range + * [-sysctl_sched_latency:sysctl_sched_latency] + */ + latency_weight = clamp_t(long, latency_weight, + sched_latency_to_weight[NICE_WIDTH], + sched_latency_to_weight[0]); + + return (thresh * latency_weight) >> SCHED_FIXEDPOINT_SHIFT; +} + static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -7051,6 +7113,8 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) { s64 gran, vdiff = curr->vruntime - se->vruntime; + vdiff += wakeup_latency_gran(curr, se); + if (vdiff <= 0) return -1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e4c65d96185e..31f71b7c0433 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1841,6 +1841,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) extern const int sched_prio_to_weight[40]; extern const u32 sched_prio_to_wmult[40]; +extern const int sched_latency_to_weight[40]; /* * {de,en}queue flags: diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h index 3bac0a8ceab2..b2e932c25be6 100644 --- a/tools/include/uapi/linux/sched.h +++ b/tools/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { #define SCHED_FLAG_KEEP_PARAMS 0x10 #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 +#define SCHED_FLAG_LATENCY_NICE 0x80 #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ SCHED_FLAG_KEEP_PARAMS) @@ -143,6 +144,7 @@ struct clone_args { SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN | \ SCHED_FLAG_KEEP_ALL | \ - SCHED_FLAG_UTIL_CLAMP) + SCHED_FLAG_UTIL_CLAMP | \ + SCHED_FLAG_LATENCY_NICE) #endif /* _UAPI_LINUX_SCHED_H */ -- Gitee From 4e5b8c900997988671fe2f4cd5fdbb81d5511dd0 Mon Sep 17 00:00:00 2001 From: chiliren Date: Fri, 24 Jun 2022 12:41:07 +0800 Subject: [PATCH 2/3] umcg --- arch/arm/tools/syscall.tbl | 1 + arch/arm64/Kconfig | 1 + arch/arm64/include/asm/thread_info.h | 11 + arch/arm64/include/asm/unistd.h | 5 +- arch/arm64/include/asm/unistd32.h | 2 + arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/ptrace.c | 6 + arch/arm64/kernel/signal.c | 2 + arch/arm64/kernel/syscall.c | 6 +- arch/arm64/mm/fault.c | 41 ++- fs/exec.c | 1 + include/linux/sched.h | 79 +++++ include/uapi/linux/umcg.h | 59 ++++ init/Kconfig | 14 + kernel/exit.c | 4 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 8 +- kernel/sched/sched.h | 3 + kernel/sched/umcg.c | 446 +++++++++++++++++++++++++++ kernel/sys_ni.c | 3 + mm/migrate.c | 9 +- mm/mprotect.c | 6 + 22 files changed, 702 insertions(+), 8 deletions(-) create mode 100644 include/uapi/linux/umcg.h create mode 100644 kernel/sched/umcg.c diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index d056a548358e..51a7cff4d60e 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -454,3 +454,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +451 common umcg_ctl sys_umcg_ctl diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 09c41d56fe78..ef0b3d46f6c4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -202,6 +202,7 @@ config ARM64 select SWIOTLB select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select HAVE_UMCG help ARM 64-bit (AArch64) Linux 
support. diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 1fbab854a51b..689168c46b9e 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -68,6 +68,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ #define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */ +#define TIF_UMCG 7 /* UMCG return to user hook */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -98,15 +99,25 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) +#define _TIF_UMCG (1 << TIF_UMCG) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT) +#define _TIF_WORK_MASK_UMCG (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ + _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT | _TIF_UMCG) + #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) +#define _TIF_SYSCALL_WORK_UMCG (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ + _TIF_SYSCALL_EMU | _TIF_UMCG) + + #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b3b2019f8d16..b85bd4084492 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -29,6 +29,9 @@ #define __NR_compat_clock_getres 264 #define __NR_compat_clock_gettime64 403 #define __NR_compat_clock_getres_time64 406 +#define __NR_umcg_ctl 450 +#define __NR_umcg_wait 451 +#define __NR_umcg_kick 452 /* * The following SVCs are ARM private. 
@@ -38,7 +41,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 441 +#define __NR_compat_syscalls 453 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 107f08e03b9f..a9c6f02c73e3 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_umcg_ctl 450 +__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index d5bc1dbdd2fd..4e826059e6d0 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -751,7 +751,7 @@ SYM_CODE_START_LOCAL(ret_to_user) bl trace_hardirqs_off #endif ldr x19, [tsk, #TSK_TI_FLAGS] - and x2, x19, #_TIF_WORK_MASK + and x2, x19, #_TIF_WORK_MASK_UMCG cbnz x2, work_pending finish_ret_to_user: user_enter_irqoff diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 2817e39881fe..b93825402a4d 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1810,6 +1810,9 @@ int syscall_trace_enter(struct pt_regs *regs) if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, regs->syscallno); + if (test_thread_flag(TIF_UMCG)) + umcg_sys_enter(regs, regs->syscallno); + audit_syscall_entry(regs->syscallno, regs->orig_x0, regs->regs[1], regs->regs[2], regs->regs[3]); @@ -1828,6 +1831,9 @@ void syscall_trace_exit(struct pt_regs *regs) if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)) tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); + if (flags & _TIF_UMCG) + umcg_sys_exit(regs); + rseq_syscall(regs); } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index e62005317ce2..37b4aa067e23 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -939,6 +939,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, if (thread_flags & _TIF_SIGPENDING) do_signal(regs); + if (thread_flags & _TIF_UMCG) + umcg_notify_resume(regs); if (thread_flags & _TIF_NOTIFY_RESUME) { tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index befde0eaa5e7..d42536d723f8 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -55,7 +55,11 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, static inline bool has_syscall_work(unsigned long flags) { - return unlikely(flags & _TIF_SYSCALL_WORK); + if (current->flags & PF_UMCG_WORKER) { + return unlikely(flags & _TIF_SYSCALL_WORK_UMCG); + } else { + return unlikely(flags & _TIF_SYSCALL_WORK); + } } int syscall_trace_enter(struct pt_regs *regs); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 795d224f184f..6a3e36c27be6 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -448,6 +448,29 @@ static bool is_write_abort(unsigned int esr) return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); } +/** + * irqentry_irq_enable - Conditionally enable IRQs from exceptions + * + * Common code for exceptions to (re)enable IRQs, typically done to allow + * from-user exceptions to schedule (since they run on the task stack). 
+ */ +static inline void irqentry_irq_enable(struct pt_regs *regs) +{ + if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) + umcg_sys_enter(regs, -1); +} + +/** + * irqentry_irq_disable - Conditionally disable IRQs from exceptions + * + * Counterpart of irqentry_irq_enable(). +*/ +static inline void irqentry_irq_disable(struct pt_regs *regs) +{ + if (user_mode(regs) && (current->flags & PF_UMCG_WORKER)) + umcg_sys_exit(regs); +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -456,6 +479,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, vm_fault_t fault; unsigned long vm_flags = VM_ACCESS_FLAGS; unsigned int mm_flags = FAULT_FLAG_DEFAULT; + bool umcgFlag = false; if (kprobe_page_fault(regs, esr)) return 0; @@ -470,6 +494,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (user_mode(regs)) mm_flags |= FAULT_FLAG_USER; + irqentry_irq_enable(regs); + umcgFlag = true; + if (is_el0_instruction_abort(esr)) { vm_flags = VM_EXEC; mm_flags |= FAULT_FLAG_INSTRUCTION; @@ -525,6 +552,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) goto no_context; + if (umcgFlag) + irqentry_irq_disable(regs); return 0; } @@ -540,8 +569,11 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, * Handle the "normal" (no error) case first. */ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | - VM_FAULT_BADACCESS)))) + VM_FAULT_BADACCESS)))) { + if (umcgFlag) + irqentry_irq_disable(regs); return 0; + } /* * If we are in kernel mode at this point, we have no context to @@ -557,6 +589,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, * oom-killed). */ pagefault_out_of_memory(); + if (umcgFlag) + irqentry_irq_disable(regs); return 0; } @@ -589,10 +623,15 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, inf->name); } + if (umcgFlag) + irqentry_irq_disable(regs); + return 0; no_context: __do_kernel_fault(addr, esr, regs); + if (umcgFlag) + irqentry_irq_disable(regs); return 0; } diff --git a/fs/exec.c b/fs/exec.c index 72f8763b3ce9..6cbed4e71590 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1831,6 +1831,7 @@ static int bprm_execve(struct linux_binprm *bprm, current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); + umcg_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c1dc8632ec3..e1be61ba2091 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -69,6 +69,7 @@ struct signal_struct; struct task_delay_info; struct task_group; struct io_uring_task; +struct umcg_task; /* * Task state bitmask. NOTE! These bits are also @@ -1312,6 +1313,19 @@ struct task_struct { unsigned long rseq_event_mask; #endif +#ifdef CONFIG_UMCG + /* setup by sys_umcg_ctrl() */ + u32 umcg_flags; + struct umcg_task __user *umcg_task; + + /* setup by umcg_pin_enter() */ + struct page *umcg_page; + + struct task_struct *umcg_server; + struct umcg_task __user *umcg_server_task; + struct page *umcg_server_page; +#endif + struct tlbflush_unmap_batch tlb_ubc; union { @@ -1691,6 +1705,13 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. 
*/ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ + +#ifdef CONFIG_UMCG +#define PF_UMCG_WORKER 0x01000000 /* UMCG worker */ +#else +#define PF_UMCG_WORKER 0x00000000 +#endif + #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ @@ -2202,6 +2223,64 @@ static inline void rseq_execve(struct task_struct *t) #endif +#ifdef CONFIG_UMCG +extern void umcg_sys_enter(struct pt_regs *regs, long syscall); +extern void umcg_sys_exit(struct pt_regs *regs); +extern void umcg_worker_exit(void); +extern void umcg_clear_child(struct task_struct *tsk); +extern void umcg_notify_resume(struct pt_regs *regs); + +/* Called by bprm_execve() in fs/exec.c. */ +static inline void umcg_execve(struct task_struct *tsk) +{ + if (tsk->umcg_task) + umcg_clear_child(tsk); +} + +/* Called by do_exit() in kernel/exit.c */ +static inline void umcg_handle_exit(void) +{ + if (current->flags & PF_UMCG_WORKER) + umcg_worker_exit(); +} + +/* + *umcg_wq_worker_[sleeping|running] are called in core.c by + *sched_submit_work() and sched_update_worker(). + */ +extern void umcg_wq_worker_sleeping(struct task_struct *tsk); + +#else /* CONFIG_UMCG */ + +static inline void umcg_sys_enter(struct pt_regs *regs, long syscall) +{ +} + +static inline void umcg_sys_exit(struct pt_regs *regs) +{ +} + +static inline void umcg_clear_child(struct task_struct *tsk) +{ +} + +static inline void umcg_wq_worker_sleeping(struct task_struct *tsk) +{ +} + +static inline void umcg_execve(struct task_struct *tsk) +{ +} + +static inline void umcg_handle_exit(void) +{ +} +static inline void umcg_notify_resume(struct pt_regs *regs) +{ +} + +#endif + #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); diff --git a/include/uapi/linux/umcg.h b/include/uapi/linux/umcg.h new file mode 100644 index 000000000000..9d6cba42094b --- /dev/null +++ b/include/uapi/linux/umcg.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_UMCG_H +#define _UAPI_LINUX_UMCG_H + +#include + +/* + * UMCG: User Managed Concurrency Groups. + * + * Syscalls (see kernel/sched/umcg.c): + * sys_umcg_ctl() - register/unregister UMCG tasks; + * + * struct umcg_task (below): controls the state of UMCG tasks. + */ + +#define UMCG_TASK_MASK 0x00ffU + +#define UMCG_TASK_ALIGN 64 + +#define UMCG_TID_MASK 0x3fffffffU + +/** + * struct umcg_task - controls the state of UMCG tasks. + * + * The struct is aligned at 64 bytes to ensure that it fits into + * a single cache line. 
+ */ +struct umcg_task { + __u32 server_tid; /* r */ + + /** + * @workers_sum: count the number of workers which is bound with server + * Read-only for the userspace + */ + __u32 workers_sum; /* r */ + + /** + * @blocked_workers_cnt: count the number of blocked workers + * + * Read-only for the userspace + */ + __u32 blocked_workers_cnt; /* r */ + + __u32 __zero[3]; +} __attribute__((packed, aligned(UMCG_TASK_ALIGN))); + +/** + * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl + * @UMCG_CTL_REGISTER: register the current task as a UMCG task + * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task + * @UMCG_CTL_WORKER: register the current task as a UMCG worker + */ +enum umcg_ctl_flag { + UMCG_CTL_REGISTER = 0x00001, + UMCG_CTL_UNREGISTER = 0x00002, + UMCG_CTL_WORKER = 0x10000, +}; + +#endif /* _UAPI_LINUX_UMCG_H */ diff --git a/init/Kconfig b/init/Kconfig index 152e85bbf8c6..09a57c93cafe 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1698,6 +1698,20 @@ config MEMBARRIER If unsure, say Y. +config HAVE_UMCG + bool + +config UMCG + bool "Enable User Managed Concurrency Groups API" + depends on 64BIT + depends on HAVE_UMCG + default n + help + Enable User Managed Concurrency Groups API, which form the basis + for an in-process M:N userspace scheduling framework. + At the moment this is an experimental/RFC feature that is not + guaranteed to be backward-compatible. + config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT default y diff --git a/kernel/exit.c b/kernel/exit.c index 795e16ecc422..36ba2c72d1fb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -728,6 +728,9 @@ void __noreturn do_exit(long code) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + /* Turn off UMCG sched hooks. */ + if (unlikely(tsk->flags & PF_UMCG_WORKER)) + tsk->flags &= ~PF_UMCG_WORKER; /* * If do_exit is called because this processes oopsed, it's possible @@ -766,6 +769,7 @@ void __noreturn do_exit(long code) io_uring_files_cancel(tsk->files); exit_signals(tsk); /* sets PF_EXITING */ sched_exit(tsk); + umcg_handle_exit(); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 1b4834073ae7..865ff3c21000 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -40,3 +40,4 @@ obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o obj-$(CONFIG_SCHED_RUNNING_AVG) += sched_avg.o obj-$(CONFIG_SCHED_CORE_CTRL) += core_ctl.o +obj-$(CONFIG_UMCG) += umcg.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc70adf89565..31a6cc2c5950 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2932,8 +2932,7 @@ static inline void walt_try_to_wake_up(struct task_struct *p) * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; @@ -3219,6 +3218,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHED_RTG p->rtg_depth = 0; #endif + umcg_clear_child(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -4722,10 +4722,12 @@ static inline void sched_submit_work(struct task_struct *tsk) * in the possible wakeup of a kworker and because wq_worker_sleeping() * requires it. 
*/ - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_WQ_WORKER)) { preempt_disable(); if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); + else if (task_flags & PF_UMCG_WORKER) + umcg_wq_worker_sleeping(tsk); else io_wq_worker_sleeping(tsk); preempt_enable_no_resched(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31f71b7c0433..53292a08a737 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1826,6 +1826,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ #define WF_ON_CPU 0x08 /* Wakee is on_cpu */ +#define WF_CURRENT_CPU 0x80 /* Prefer to move the wakee to the current CPU. */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -2794,6 +2795,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags); + #ifdef CONFIG_SCHED_RTG extern bool task_fits_max(struct task_struct *p, int cpu); extern unsigned long capacity_spare_without(int cpu, struct task_struct *p); diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c new file mode 100644 index 000000000000..50aade21560a --- /dev/null +++ b/kernel/sched/umcg.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * User Managed Concurrency Groups (UMCG). + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +static struct task_struct *umcg_get_task(u32 tid) +{ + struct task_struct *tsk = NULL; + + if (tid) { + rcu_read_lock(); + tsk = find_task_by_vpid(tid & UMCG_TID_MASK); + if (tsk && current->mm == tsk->mm && tsk->umcg_task) + get_task_struct(tsk); + else + tsk = NULL; + rcu_read_unlock(); + } + + return tsk; +} + +/* + * Pinning a page inhibits rmap based unmap for Anon pages. Doing a store + * through the user mapping ensures the user mapping exists and is writable. + */ +static int umcg_pin_page(struct umcg_task __user *self, struct page **pagep) +{ + int ret = -EFAULT; + + if (pin_user_pages_fast((unsigned long)self, 1, FOLL_WRITE, pagep) != 1) + goto out; + + if (!PageAnon(*pagep) || + put_user(0ULL, &self->__zero[0])) { + unpin_user_page(*pagep); + goto out; + } + + ret = 0; +out: + return ret; +} + +/** + * umcg_pin_pages: pin pages containing struct umcg_task of + * this task and its server (possibly this task again). 
+ */ +static int umcg_pin_pages(void) +{ + struct task_struct *server = NULL, *tsk = current; + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task); + int server_tid; + int ret; + + /* must not have stale state */ + if (WARN_ON_ONCE(tsk->umcg_page || + tsk->umcg_server_page || + tsk->umcg_server_task || + tsk->umcg_server)) + return -EBUSY; + + ret = umcg_pin_page(self, &tsk->umcg_page); + if (ret) + goto clear_self; + + if (get_user(server_tid, &self->server_tid)) + goto unpin_self; + + ret = -ESRCH; + server = umcg_get_task(server_tid); + if (!server) + goto unpin_self; + + /* must cache due to possible concurrent change */ + tsk->umcg_server_task = READ_ONCE(server->umcg_task); + ret = umcg_pin_page(tsk->umcg_server_task, &tsk->umcg_server_page); + if (ret) + goto clear_server; + + tsk->umcg_server = server; + + return 0; + +clear_server: + tsk->umcg_server_task = NULL; + tsk->umcg_server_page = NULL; + +unpin_self: + unpin_user_page(tsk->umcg_page); +clear_self: + tsk->umcg_page = NULL; + + return ret; +} + +static void umcg_unpin_pages(void) +{ + struct task_struct *tsk = current; + + if (tsk->umcg_server) { + unpin_user_page(tsk->umcg_page); + tsk->umcg_page = NULL; + + unpin_user_page(tsk->umcg_server_page); + tsk->umcg_server_page = NULL; + tsk->umcg_server_task = NULL; + + put_task_struct(tsk->umcg_server); + tsk->umcg_server = NULL; + } +} + +static void umcg_clear_task(struct task_struct *tsk) +{ + /* + * This is either called for the current task, or for a newly forked + * task that is not yet running, so we don't need strict atomicity + * below. + */ + if (tsk->umcg_task) { + WRITE_ONCE(tsk->umcg_task, NULL); + + tsk->flags &= ~PF_UMCG_WORKER; + clear_tsk_thread_flag(tsk, TIF_UMCG); + } + + tsk->umcg_page = NULL; + + tsk->umcg_server = NULL; + tsk->umcg_server_page = NULL; + tsk->umcg_server_task = NULL; +} + +/* Called for a forked or execve-ed child. */ +void umcg_clear_child(struct task_struct *tsk) +{ + umcg_clear_task(tsk); +} + +/* Called both by normally (unregister) and abnormally exiting workers. */ +void umcg_worker_exit(void) +{ + umcg_unpin_pages(); + umcg_clear_task(current); +} + +#define __UMCG_DIE(stmt, reason) do { \ + stmt; \ + pr_warn_ratelimited("%s: killing task %s/%d because: " reason "\n",\ + __func__, current->comm, current->pid); \ + force_sig(SIGKILL); \ + return; \ +} while (0) + +#define UMCG_DIE(reason) __UMCG_DIE(,reason) +#define UMCG_DIE_PF(reason) __UMCG_DIE(pagefault_enable(), reason) + +/* Called from syscall enter path and exceptions that can schedule */ +void umcg_sys_enter(struct pt_regs *regs, long syscall) +{ + /* avoid recursion vs our own syscalls */ + if (syscall == __NR_umcg_ctl) + return; + + /* avoid recursion vs schedule() */ + current->flags &= ~PF_UMCG_WORKER; + + /* + * Pin all the state on sys_enter() such that we can rely on it + * from dodgy contexts. It is either unpinned from pre-schedule() + * or sys_exit(), whichever comes first, thereby ensuring the pin + * is temporary. 
+ */ + if (umcg_pin_pages()) + UMCG_DIE("pin"); + + current->flags |= PF_UMCG_WORKER; +} + +static int umcg_upd_workers_sum(struct task_struct *server, bool registed) +{ + struct umcg_task __user *self = server->umcg_task; + u32 sum; + + if (!user_access_begin(self, sizeof(*self))) + return -EFAULT; + + sum = self->workers_sum; + if (registed) + unsafe_put_user(++sum, &self->workers_sum, Efault); + else + unsafe_put_user(--sum, &self->workers_sum, Efault); + + user_access_end(); + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +static int umcg_upd_blk_workers_cnt(struct task_struct *tsk, bool blocked) +{ + struct umcg_task __user *server = tsk->umcg_server_task; + struct umcg_task __user *self = tsk->umcg_task; + u32 sum; + + /* + * umcg_pin_pages() did access_ok() on both pointers, use self here + * only because __user_access_begin() isn't available in generic code. + */ + if (!user_access_begin(self, sizeof(*self))) + return -EFAULT; + + sum = server->blocked_workers_cnt; + if (blocked) + unsafe_put_user(++sum, &server->blocked_workers_cnt, Efault); + else + unsafe_put_user(--sum, &server->blocked_workers_cnt, Efault); + + user_access_end(); + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +/* pre-schedule() */ +void umcg_wq_worker_sleeping(struct task_struct *tsk) +{ + if (!tsk->umcg_server || tsk->state) { + /* + * Either this task blocked before, or SYSCALL_UMCG is + * (temporarily) disabled (see umcg_notify_resume()). Either + * way the pages are unpinned and there's nothing to do. + */ + return; + } + + /* Must not fault, mmap_sem might be held. */ + pagefault_disable(); + + if(umcg_upd_blk_workers_cnt(tsk, true)) + UMCG_DIE_PF("upd_cnt"); + + pagefault_enable(); + + /* + * We're going to sleep, make sure to unpin the pages, this ensures + * the pins are temporary. Also see umcg_sys_exit(). + */ + umcg_unpin_pages(); +} + +/* Called from syscall exit path and exceptions that can schedule */ +void umcg_sys_exit(struct pt_regs *regs) +{ + struct task_struct *tsk = current; + long syscall = syscall_get_nr(tsk, regs); + + if (syscall == __NR_umcg_ctl) + return; + + if (tsk->umcg_server) { + /* + * Didn't block, we done. 
+ */ + umcg_unpin_pages(); + return; + } +} + +/* return-to-user path */ +void umcg_notify_resume(struct pt_regs *regs) +{ + struct task_struct *tsk = current; + bool worker = tsk->flags & PF_UMCG_WORKER; + + if (!worker) + return; + + current->flags &= ~PF_UMCG_WORKER; + + if (umcg_pin_pages()) + UMCG_DIE("pin"); + + umcg_upd_blk_workers_cnt(tsk, false); + + umcg_unpin_pages(); + + current->flags |= PF_UMCG_WORKER; +} + +static int umcg_register(struct umcg_task __user *self, u32 flags) +{ + struct task_struct *server; + struct umcg_task ut; + + current->umcg_flags = flags; + + if (current->umcg_task || !self) + return -EINVAL; + + if (copy_from_user(&ut, self, sizeof(ut))) + return -EFAULT; + + rcu_read_lock(); + server = find_task_by_vpid(ut.server_tid); + if (server && server->mm == current->mm) { + if (flags == UMCG_CTL_WORKER) { + if (!server->umcg_task || (server->flags & PF_UMCG_WORKER)) + server = NULL; + } else { + if (server != current) + server = NULL; + } + } else { + server = NULL; + } + rcu_read_unlock(); + + if (!server) + return -ESRCH; + + if (flags == UMCG_CTL_WORKER) { + umcg_upd_workers_sum(server, true); + WRITE_ONCE(current->umcg_task, self); + current->flags |= PF_UMCG_WORKER; /* hook schedule() */ + set_thread_flag(TIF_UMCG); /* hook return-to-user */ + } else { + self->workers_sum = 0U; + WRITE_ONCE(current->umcg_task, self); + set_thread_flag(TIF_UMCG); /* hook return-to-user */ + } + + return 0; +} + +static int umcg_unregister(struct umcg_task __user *self, u32 flags) +{ + bool worker = current->flags & PF_UMCG_WORKER; + int ret; + + if (!self || self != current->umcg_task) + return -EINVAL; + + if (!worker != !(flags & UMCG_CTL_WORKER)) + return -EINVAL; + + current->flags &= ~PF_UMCG_WORKER; + + ret = umcg_pin_pages(); + if (ret) { + if (worker) + current->flags |= PF_UMCG_WORKER; + return ret; + } + + ret = umcg_upd_workers_sum(current->umcg_server, false); + if (ret) + return ret; + + umcg_unpin_pages(); + umcg_clear_task(current); + return 0; +} + +#define UMCG_CTL_CMD 0xff + +/** + * sys_umcg_ctl: (un)register the current task as a UMCG task. + * @flags: ORed values from enum umcg_ctl_flag; see below; + * @self: a pointer to struct umcg_task that describes this + * task. + * + * @flags & UMCG_CTL_REGISTER: register a UMCG task: + * + * UMCG workers: + * - @flags & UMCG_CTL_WORKER + * + * UMCG servers: + * - !(@flags & UMCG_CTL_WORKER) + * + * All tasks: + * - self->server_tid must be a valid server + * + * If the conditions above are met, sys_umcg_ctl() immediately returns + * if the registered task is a server. If the registered task is a + * worker, it's server's workers_sum will be added. + * + * @flags & UMCG_CTL_UNREGISTER: unregister a UMCG task. 
+ * + * UMCG workers: + * - @flags & UMCG_CTL_WORKER + * + * UMCG servers: + * - !(@flags & UMCG_CTL_WORKER) + * + * All tasks: + * - self must match with UMCG_CTL_REGISTER + * + * Return: + * 0 - success + * -EFAULT - failed to read @self + * -EINVAL - some other error occurred + * -ESRCH - no such server_tid + */ +SYSCALL_DEFINE2(umcg_ctl, u32, flags, struct umcg_task __user *, self) +{ + int cmd = flags & UMCG_CTL_CMD; + + if ((unsigned long)self % UMCG_TASK_ALIGN) + return -EINVAL; + + flags &= ~UMCG_CTL_CMD; + + if (flags & ~UMCG_CTL_WORKER) + return -EINVAL; + + switch (cmd) { + case UMCG_CTL_REGISTER: + return umcg_register(self, flags); + + case UMCG_CTL_UNREGISTER: + return umcg_unregister(self, flags); + + default: + break; + } + + return -EINVAL; +} diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f27ac94d5fa7..5dbf0f9aabc4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -264,6 +264,9 @@ COND_SYSCALL(request_key); COND_SYSCALL(keyctl); COND_SYSCALL_COMPAT(keyctl); +/* kernel/sched/umcg.c */ +COND_SYSCALL(umcg_ctl); + /* arch/example/kernel/sys_example.c */ /* mm/fadvise.c */ diff --git a/mm/migrate.c b/mm/migrate.c index 278e6f3fa62c..2dfaf0f832e9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1449,7 +1449,14 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, nr_subpages = thp_nr_pages(page); cond_resched(); - if (PageHuge(page)) + /* + * If the page has a pin then expected_page_refs() will + * not match and the whole migration will fail later + * anyway, fail early and preserve the mappings. + */ + if (page_maybe_dma_pinned(page)) + rc = -EAGAIN; + else if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, pass > 2, mode, reason); diff --git a/mm/mprotect.c b/mm/mprotect.c index 53b6b1b8fb67..d6934ccc4165 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,6 +105,12 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (page_is_file_lru(page) && PageDirty(page)) continue; + /* + * Can't migarate pinned pages, avoid touching them. + */ + if (page_maybe_dma_pinned(page)) + continue; + /* * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. -- Gitee From bb488204e0cd61cbc3f2515d4033967fb450fe38 Mon Sep 17 00:00:00 2001 From: chiliren Date: Fri, 24 Jun 2022 12:45:22 +0800 Subject: [PATCH 3/3] del wait & kick --- arch/arm64/include/asm/unistd.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b85bd4084492..70644d0b6168 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -30,8 +30,6 @@ #define __NR_compat_clock_gettime64 403 #define __NR_compat_clock_getres_time64 406 #define __NR_umcg_ctl 450 -#define __NR_umcg_wait 451 -#define __NR_umcg_kick 452 /* * The following SVCs are ARM private. -- Gitee
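[Editor's illustration, not part of the series] A minimal userspace sketch of the register/unregister flow that remains after patch 3 removes umcg_wait/umcg_kick. It assumes a kernel built with CONFIG_UMCG, the arm64 syscall number 450 wired up in patch 2, and mirrors the uapi definitions above; compile with -lpthread:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>

#define __NR_umcg_ctl		450	/* from this series (arm64) */

#define UMCG_CTL_REGISTER	0x00001
#define UMCG_CTL_UNREGISTER	0x00002
#define UMCG_CTL_WORKER		0x10000

struct umcg_task {			/* mirrors the uapi header in patch 2 */
	uint32_t server_tid;
	uint32_t workers_sum;
	uint32_t blocked_workers_cnt;
	uint32_t __zero[3];
} __attribute__((packed, aligned(64)));

/* One umcg_task per thread; must be 64-byte aligned and writable. */
static struct umcg_task server_ut, worker_ut;
static pid_t server_tid;

static long umcg_ctl(uint32_t flags, struct umcg_task *self)
{
	return syscall(__NR_umcg_ctl, flags, self);
}

static void *worker_fn(void *arg)
{
	/* A worker names an already-registered server via server_tid. */
	worker_ut.server_tid = server_tid;
	if (umcg_ctl(UMCG_CTL_REGISTER | UMCG_CTL_WORKER, &worker_ut))
		perror("register worker");

	/*
	 * ... do work here; while registered, the worker's blocking in the
	 * kernel is reflected in the server's blocked_workers_cnt via the
	 * pre-schedule and return-to-user hooks added in patch 2 ...
	 */

	if (umcg_ctl(UMCG_CTL_UNREGISTER | UMCG_CTL_WORKER, &worker_ut))
		perror("unregister worker");
	return NULL;
}

int main(void)
{
	pthread_t worker;

	/* A server registers itself: server_tid is its own tid. */
	server_tid = syscall(SYS_gettid);
	server_ut.server_tid = server_tid;
	if (umcg_ctl(UMCG_CTL_REGISTER, &server_ut))
		perror("register server");

	pthread_create(&worker, NULL, worker_fn, NULL);
	pthread_join(worker, NULL);

	return umcg_ctl(UMCG_CTL_UNREGISTER, &server_ut) ? 1 : 0;
}

A server can only manage tasks in its own process (the kernel checks that the looked-up server shares current->mm), and workers_sum / blocked_workers_cnt give the userspace scheduler a cheap, read-only view of how many of its workers exist and how many are currently blocked in the kernel.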