From 1993c0e5fa396623f1480b4d77fcd46d1d4f54fc Mon Sep 17 00:00:00 2001
From: Wang Jingjin <wangjingjin1@huawei.com>
Date: Fri, 22 Jul 2022 17:58:57 +0800
Subject: [PATCH] sched: Worker Group Control Monitor

linux inclusion
category: feature
issue: #I5IH6R
CVE: NA

Signed-off-by: Wang Jingjin <wangjingjin1@huawei.com>

------------------------------------

Support thread pool management.

Use prctl() to register/unregister a thread as a server or a worker.
Whenever a worker blocks or wakes up, its server's count of blocked
threads is updated. User space can read the number of blocked threads
by using prctl().

Signed-off-by: Wang Jingjin <wangjingjin1@huawei.com>
---
 fs/exec.c                    |   1 +
 include/linux/sched.h        |  44 ++++++++
 include/trace/events/sched.h |  17 +++
 include/uapi/linux/prctl.h   |   5 +
 include/uapi/linux/wgcm.h    |  61 ++++++++++
 init/Kconfig                 |   8 ++
 kernel/exit.c                |   2 +
 kernel/sched/Makefile        |   1 +
 kernel/sched/core.c          |  19 ++++
 kernel/sched/wgcm.c          | 212 +++++++++++++++++++++++++++++++++++
 kernel/sys.c                 |  12 ++
 11 files changed, 382 insertions(+)
 create mode 100644 include/uapi/linux/wgcm.h
 create mode 100644 kernel/sched/wgcm.c

diff --git a/fs/exec.c b/fs/exec.c
index 72f8763b3ce9..131bfbb88698 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1831,6 +1831,7 @@ static int bprm_execve(struct linux_binprm *bprm,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	rseq_execve(current);
+	wgcm_clear_child(current);
 	acct_update_integrals(current);
 	task_numa_free(current, false);
 	return retval;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6ae4d7ae5a3b..c015f78966af 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -69,6 +69,7 @@ struct signal_struct;
 struct task_delay_info;
 struct task_group;
 struct io_uring_task;
+struct wgcm_task;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -1305,6 +1306,12 @@ struct task_struct {
 	unsigned long rseq_event_mask;
 #endif
 
+#ifdef CONFIG_WGCM
+	unsigned long wgcm_flags;
+	struct wgcm_task *wgcm_task;
+	struct wgcm_task *wgcm_server_task;
+#endif
+
 	struct tlbflush_unmap_batch	tlb_ubc;
 
 	union {
@@ -1685,6 +1692,13 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
+
+#ifdef CONFIG_WGCM
+#define PF_WGCM_WORKER		0x01000000	/* WGCM worker*/
+#else
+#define PF_WGCM_WORKER		0x00000000
+#endif
+
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
@@ -2207,6 +2221,36 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+#ifdef CONFIG_WGCM
+extern int wgcm_ctl(unsigned long flags, unsigned long server_tid);
+extern void wgcm_do_exit(struct task_struct *tsk);
+extern void wgcm_upd_blk_workers_sum(struct task_struct *p, bool active);
+extern int wgcm_get_taskinfo(struct wgcm_task __user *self);
+extern void wgcm_clear_child(struct task_struct *p);
+#else
+static inline int wgcm_ctl(unsigned long flags, unsigned long server_tid)
+{
+	return -EINVAL;	/* !CONFIG_WGCM: fail like any unsupported prctl option */
+}
+
+static inline void wgcm_do_exit(struct task_struct *tsk)
+{
+}
+
+static inline void wgcm_upd_blk_workers_sum(struct task_struct *p, bool active)
+{
+}
+
+static inline int wgcm_get_taskinfo(struct wgcm_task __user *self)
+{
+	return -EINVAL;	/* !CONFIG_WGCM: @self is never written, so do not report success */
+}
+
+static inline void wgcm_clear_child(struct task_struct *p)
+{
+}
+#endif
+
 const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
 char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
 int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dd5fff2bb1b2..fae73fcbf51d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -764,6 +764,23 @@ TRACE_EVENT(sched_isolate,
 );
 #endif
 
+TRACE_EVENT(tracing_mark_wgcm,	/* begin/end marker; emitted around the WGCM hooks in core.c */
+	TP_PROTO(int pid, const char *name, bool trace_begin),
+	TP_ARGS(pid, name, trace_begin),
+	TP_STRUCT__entry(
+			__field(int, pid)
+			__string(trace_name, name)
+			__field(bool, trace_begin)
+	),
+	TP_fast_assign(
+			__entry->pid = pid;
+			__assign_str(trace_name, name);
+			__entry->trace_begin = trace_begin;
+	),
+	TP_printk("%s|%d|%s", __entry->trace_begin ? "B" : "E",	/* "B|pid|name" or "E|pid|name" */
+		__entry->pid, __get_str(trace_name))
+);
+
 /*
  * Following tracepoints are not exported in tracefs and provide hooking
  * mechanisms only for testing and debugging purposes.
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index c190e220ff67..d5187a0ab628 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -247,6 +247,11 @@ struct prctl_mm_map {
 #define PR_SET_IO_FLUSHER		57
 #define PR_GET_IO_FLUSHER		58
 
+/* Register or unregister the current task as a WGCM task */
+#define PR_WGCM_CTL			59
+/* Copy the calling task's wgcm_task data to userspace */
+#define PR_GET_WGCM_TASK		60
+
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
diff --git a/include/uapi/linux/wgcm.h b/include/uapi/linux/wgcm.h
new file mode 100644
index 000000000000..695966c8c09d
--- /dev/null
+++ b/include/uapi/linux/wgcm.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_WGCM_H
+#define _UAPI_LINUX_WGCM_H
+
+#include <linux/types.h>
+
+/*
+ * WGCM: Workgroup Control Monitor.
+ *
+ * use sys_prctl() (see kernel/sys.c) :
+ *	wgcm_ctl():	register/unregister WGCM tasks.
+ *
+ */
+
+#define UMCG_TASK_ALIGN			64
+
+#define UMCG_TID_MASK			0x3fffffffU
+
+/**
+ * struct wgcm_task - per-task WGCM state; the block copied out to userspace.
+ *
+ * Aligned to UMCG_TASK_ALIGN (64) bytes; with 64-byte cache lines it fits in
+ * a single line. NOTE(review): atomic_t is a kernel type - confirm UAPI use.
+ */
+struct wgcm_task {
+	/**
+	 * @server_tid: server's tid.
+	 */
+	__u32	server_tid;			/* r w */
+
+	/**
+	 * @workers_sum: number of workers currently bound to this server
+	 *
+	 * Read-only for the userspace
+	 */
+	atomic_t	workers_sum;			/* r   */
+
+	/**
+	 * @blk_workers_sum: number of this server's workers that are blocked
+	 *
+	 * Read-only for the userspace
+	 */
+	atomic_t	blk_workers_sum;		/* r   */
+
+	__u32	__zero[1];	/* NOTE(review): presumably reserved padding - confirm */
+
+} __attribute__((packed, aligned(UMCG_TASK_ALIGN)));
+
+/**
+ * enum wgcm_ctl_flag - flags to pass to wgcm_ctl()
+ * @WGCM_CTL_REGISTER:   command: register the current task as a WGCM task
+ * @WGCM_CTL_UNREGISTER: command: unregister the current task as a WGCM task
+ * @WGCM_CTL_WORKER:     modifier ORed with a command: the task is a worker
+ */
+enum wgcm_ctl_flag {
+	WGCM_CTL_REGISTER	= 0x0001,	/* commands live in the low byte */
+	WGCM_CTL_UNREGISTER	= 0x0002,
+	WGCM_CTL_WORKER		= 0x0100,	/* the only modifier wgcm_ctl() accepts */
+};
+
+#endif /* _UAPI_LINUX_WGCM_H */
diff --git a/init/Kconfig b/init/Kconfig
index 2e5b9288081e..1ca74fe127a5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1699,6 +1699,14 @@ config KALLSYMS
 	  symbolic stack backtraces. This increases the size of the kernel
 	  somewhat, as all symbols have to be loaded into the kernel image.
 
+config WGCM
+	bool "Enable Workgroup Control Monitor API"
+	depends on 64BIT
+	default n
+	help
+	  Enable the Workgroup Control Monitor (WGCM) prctl() API, which lets
+	  userspace read how many of a server's worker threads are blocked.
+
 config KALLSYMS_ALL
 	bool "Include all symbols in kallsyms"
 	depends on DEBUG_KERNEL && KALLSYMS
diff --git a/kernel/exit.c b/kernel/exit.c
index 795e16ecc422..e870f93113a5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -850,6 +850,8 @@ void __noreturn do_exit(long code)
 	if (tsk->task_frag.page)
 		put_page(tsk->task_frag.page);
 
+	wgcm_do_exit(tsk);
+
 	validate_creds_for_do_exit(tsk);
 
 	check_stack_usage();
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 1b4834073ae7..8290730d57c9 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_CPU_ISOLATION) += isolation.o
 obj-$(CONFIG_PSI) += psi.o
 obj-$(CONFIG_SCHED_RUNNING_AVG) += sched_avg.o
 obj-$(CONFIG_SCHED_CORE_CTRL) += core_ctl.o
+obj-$(CONFIG_WGCM) += wgcm.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 549334102718..e9b8d341a578 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1603,16 +1603,33 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	u32 old_state = p->on_rq;	/* on_rq before the enqueue; 0 = was off the runqueue */
+
 	enqueue_task(rq, p, flags);
 
 	p->on_rq = TASK_ON_RQ_QUEUED;
+
+	if (old_state == 0 && p->flags & PF_WGCM_WORKER) {
+		trace_tracing_mark_wgcm(current->tgid, "active_task", true);	/* NOTE(review): current's tgid, not p's */
+		wgcm_upd_blk_workers_sum(p, true);	/* worker runnable again: blocked count - 1 */
+		trace_tracing_mark_wgcm(current->tgid, "", false);
+	}
+
 }
 
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	u32 old_state = p->on_rq;	/* on_rq before the dequeue */
+
 	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
 
 	dequeue_task(rq, p, flags);
+
+	if (old_state != 0 && p->on_rq == 0 && p->flags & PF_WGCM_WORKER) {
+		trace_tracing_mark_wgcm(current->tgid, "deactive_task", true);	/* NOTE(review): current's tgid, not p's */
+		wgcm_upd_blk_workers_sum(p, false);	/* DEQUEUE_SLEEP of a worker: blocked count + 1 */
+		trace_tracing_mark_wgcm(current->tgid, "", false);
+	}
 }
 
 static inline int __normal_prio(int policy, int rt_prio, int nice)
@@ -3213,6 +3230,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #ifdef CONFIG_SCHED_RTG
 	p->rtg_depth = 0;
 #endif
+
+	wgcm_clear_child(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/wgcm.c b/kernel/sched/wgcm.c
new file mode 100644
index 000000000000..2f54fc962acd
--- /dev/null
+++ b/kernel/sched/wgcm.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* WGCM: Workgroup Control Monitor */
+
+#include <linux/wgcm.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+
+#include "sched.h"
+
+/* Reset every WGCM field of @p; p->wgcm_task is only forgotten, never freed here. */
+void wgcm_clear_child(struct task_struct *p)
+{
+	if (p->wgcm_task != NULL) {
+		WRITE_ONCE(p->wgcm_task, NULL);
+		p->flags &= ~PF_WGCM_WORKER;
+	}
+	p->wgcm_server_task = NULL;
+	p->wgcm_flags = 0;
+}
+
+static void wgcm_clear_task(struct task_struct *tsk)
+{
+	/*
+	 * Only used on the current task in this file, so no strict atomicity
+	 * is needed. kfree(NULL) is a no-op. The pointer is reset after the
+	 * free so that wgcm_get_taskinfo() cannot copy freed memory and
+	 * wgcm_register() does not reject the task over a stale wgcm_task.
+	 */
+	kfree(tsk->wgcm_task);
+	WRITE_ONCE(tsk->wgcm_task, NULL);
+	tsk->flags &= ~PF_WGCM_WORKER;
+
+	tsk->wgcm_flags = 0;
+	tsk->wgcm_server_task = NULL;
+}
+
+/* Track blocked workers on @p's server: wakeup (@active) decrements, sleep increments. */
+void wgcm_upd_blk_workers_sum(struct task_struct *p, bool active)
+{
+	struct wgcm_task *server = p->wgcm_server_task;
+
+	if (!server) {	/* called from (de)activate_task(): printk must be deferred */
+		printk_deferred(KERN_ERR "[WGCM]The WGCM worker is not bound to its server\n");
+		return;
+	}
+	if (active)
+		atomic_dec(&server->blk_workers_sum);
+	else
+		atomic_inc(&server->blk_workers_sum);
+}
+
+static void wgcm_upd_workers_sum(struct wgcm_task *server, bool regist)
+{
+	atomic_t *nr_workers = &server->workers_sum;
+
+	/* @regist: a worker joined the server (+1); otherwise one left (-1). */
+	atomic_add(regist ? 1 : -1, nr_workers);
+}
+
+/* Exit hook: drop a worker's registration so it no longer counts against its server. */
+void wgcm_do_exit(struct task_struct *tsk)
+{
+	struct wgcm_task *server = tsk->wgcm_server_task;
+
+	if (!server)	/* NOTE(review): a server's own wgcm_task is not freed here */
+		return;
+
+	if (tsk->flags & PF_WGCM_WORKER)
+		wgcm_upd_workers_sum(server, false);
+	/* Also clears PF_WGCM_WORKER so the final deactivate_task() is not counted as a block. */
+	wgcm_clear_task(tsk);
+}
+
+/*
+ * prctl(PR_GET_WGCM_TASK): copy the calling server's wgcm_task to @self.
+ * Returns 0, -EINVAL for workers/unregistered tasks, -EFAULT on a bad buffer.
+ */
+int wgcm_get_taskinfo(struct wgcm_task __user *self)
+{
+	struct task_struct *tsk = current;
+
+	if ((tsk->flags & PF_WGCM_WORKER) || !tsk->wgcm_task)
+		return -EINVAL;
+
+	/* copy_to_user() returns the bytes left uncopied, not an errno. */
+	if (copy_to_user(self, tsk->wgcm_task, sizeof(*self)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Register current as a server (@flags == 0, @server_tid == own tid) or as a
+ * worker of server @server_tid in the same mm. 0, -EINVAL, -ENOMEM or -ESRCH.
+ */
+static int wgcm_register(unsigned long flags, unsigned long server_tid)
+{
+	struct wgcm_task *server_wt = NULL;
+	struct task_struct *tsk = current;
+	struct task_struct *server;
+	struct wgcm_task *wt;
+
+	if (tsk->wgcm_task || server_tid == 0) {
+		pr_err("[WGCM][PID:%d]server_tid = %lu\n", tsk->pid, server_tid);
+		return -EINVAL;
+	}
+
+	wt = kzalloc(sizeof(*wt), GFP_KERNEL);
+	if (!wt) {
+		pr_err("[WGCM_REG] alloc wgcm task fail!\n");
+		return -ENOMEM;
+	}
+	wt->server_tid = server_tid;
+
+	/*
+	 * @server is only guaranteed to stay valid inside the RCU section, so
+	 * everything needed from it is read before rcu_read_unlock().
+	 */
+	rcu_read_lock();
+	server = find_task_by_vpid(server_tid);
+	if (!server)
+		pr_err("[WGCM][PID:%d]find server(%lu) fail\n", tsk->pid, server_tid);
+	else if (server->mm != tsk->mm)
+		server = NULL;
+	else if (flags != WGCM_CTL_WORKER)
+		server = (server == tsk) ? server : NULL;	/* a server names itself */
+	else if (!(server->flags & PF_WGCM_WORKER))
+		server_wt = READ_ONCE(server->wgcm_task);	/* NULL: not a server */
+	rcu_read_unlock();
+
+	if (!server || (flags == WGCM_CTL_WORKER && !server_wt)) {
+		kfree(wt);
+		return -ESRCH;
+	}
+
+	tsk->wgcm_flags = flags;	/* recorded only once registration is certain */
+	WRITE_ONCE(tsk->wgcm_task, wt);
+	if (flags == WGCM_CTL_WORKER) {
+		WRITE_ONCE(tsk->wgcm_server_task, server_wt);
+		wgcm_upd_workers_sum(server_wt, true);
+		tsk->flags |= PF_WGCM_WORKER;	/* hook schedule() */
+	}
+
+	return 0;
+}
+
+/* Undo wgcm_register() for current; always reports success. */
+static int wgcm_unregister(void)
+{
+	if (current->wgcm_server_task != NULL)	/* set only for workers */
+		wgcm_upd_workers_sum(current->wgcm_server_task, false);
+	wgcm_clear_task(current);
+	return 0;
+}
+
+#define WGCM_CTL_CMD	0xff
+
+/**
+ * wgcm_ctl: (un)register the current task as a WGCM task.
+ * @flags:       ORed values from enum wgcm_ctl_flag; see below;
+ * @server_tid:  server's(monitor's) thread id.
+ *
+ * @flags & WGCM_CTL_REGISTER: register a WGCM task:
+ *
+ * @flags & WGCM_CTL_UNREGISTER: unregister a WGCM task.
+ *
+ *	WGCM workers:
+ *	 - @flags & WGCM_CTL_WORKER
+ *
+ *	WGCM server:
+ *	 - !(@flags & WGCM_CTL_WORKER)
+ *
+ *	All tasks:
+ *	 - server_tid must be valid(e.g. not zero) when registering.
+ *
+ *	If the conditions above are met, wgcm_ctl() immediately returns
+ *	if the registered task is a server. If the registered task is a
+ *	worker, its server's workers_sum will be increased. Conversely, if
+ *	the unregistered task is a worker, its server's workers_sum will
+ *	be decreased.
+ *
+ * Return:
+ * 0		- success
+ * -EINVAL	- unknown command/flag, or the task is already registered
+ * -ENOMEM	- failed to allocate the task's wgcm_task
+ * -ESRCH	- no such server_tid
+ */
+int wgcm_ctl(unsigned long flags, unsigned long server_tid)
+{
+	int cmd = flags & WGCM_CTL_CMD;
+
+	flags &= ~WGCM_CTL_CMD;
+
+	/* Debug-only trace: this runs before validation and is user-triggerable. */
+	pr_debug("[WGCM][PID:%d] wgcm_ctl cmd = %d, flags = %lu\n", current->pid, cmd, flags);
+
+	if (flags & ~WGCM_CTL_WORKER)
+		return -EINVAL;
+
+	switch (cmd) {
+	case WGCM_CTL_REGISTER:
+		return wgcm_register(flags, server_tid);
+
+	case WGCM_CTL_UNREGISTER:
+		return wgcm_unregister();
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index c63de71889bf..1433fb59a265 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2588,6 +2588,18 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
 		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
 		break;
+	case PR_WGCM_CTL:
+		pr_err("[PID:%d]arg2=%lu, arg3=%lu, arg4=%lu, arg5=%lu\n",
+			current->pid, arg2, arg3, arg4, arg5);
+		if (arg4 || arg5)
+			return -EINVAL;
+
+		return wgcm_ctl(arg2, arg3);
+	case PR_GET_WGCM_TASK:
+		if (!arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+
+		return wgcm_get_taskinfo((struct wgcm_task __user *)arg2);
 	case PR_SET_VMA:
 		error = prctl_set_vma(arg2, arg3, arg4, arg5);
 		break;
-- 
Gitee