From 1993c0e5fa396623f1480b4d77fcd46d1d4f54fc Mon Sep 17 00:00:00 2001
From: Wang Jingjin
Date: Fri, 22 Jul 2022 17:58:57 +0800
Subject: [PATCH] sched: Worker Group Control Monitor

linux inclusion
category: feature
issue: #I5IH6R
CVE: NA

Signed-off-by: Wang Jingjin

------------------------------------

Support thread pool management.

A thread uses prctl() to register or unregister itself as a server or
as a worker bound to a server. Whenever a worker blocks, it is added to
its server's count of blocked workers, and removed again when it becomes
runnable. The server can read the counters with prctl().

Signed-off-by: Wang Jingjin
---
Review notes on this revision:
 - A wgcm_task is now reference counted (owner + one per bound worker)
   and freed after an RCU grace period. Previously a server that
   unregistered freed memory its workers still used from the scheduler
   hooks, and a server that exited leaked it.
 - Unregister/exit clear task->wgcm_task; the stale pointer used to allow
   a double kfree() and a use-after-free in PR_GET_WGCM_TASK.
 - Exit and exec clear PF_WGCM_WORKER and drop the registration; an exited
   worker used to stay counted as blocked, and exec leaked the wgcm_task.
 - The server's wgcm_task is pinned inside the RCU read section instead
   of dereferencing the server task after rcu_read_unlock().
 - copy_to_user() failure returns -EFAULT, not the byte remainder.
 - The user-triggerable pr_err() calls are gone; the scheduler hook uses
   printk_deferred_once().
 - The uapi struct uses __u32 instead of atomic_t (same layout).
 - The !CONFIG_WGCM stubs return -EINVAL instead of claiming success.
 - The #include targets were unreadable in the reviewed copy and have
   been reconstructed; please confirm them against the original.

 fs/exec.c                    |   1 +
 include/linux/sched.h        |  44 ++++++
 include/trace/events/sched.h |  17 +++
 include/uapi/linux/prctl.h   |   5 +
 include/uapi/linux/wgcm.h    |  61 ++++++++
 init/Kconfig                 |   8 ++
 kernel/exit.c                |   2 +
 kernel/sched/Makefile        |   1 +
 kernel/sched/core.c          |  18 +++
 kernel/sched/wgcm.c          | 263 +++++++++++++++++++++++++++++++++++
 kernel/sys.c                 |  10 ++
 11 files changed, 430 insertions(+)
 create mode 100644 include/uapi/linux/wgcm.h
 create mode 100644 kernel/sched/wgcm.c

diff --git a/fs/exec.c b/fs/exec.c
index 72f8763b3ce9..131bfbb88698 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1831,6 +1831,7 @@ static int bprm_execve(struct linux_binprm *bprm,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	rseq_execve(current);
+	wgcm_do_exit(current);
 	acct_update_integrals(current);
 	task_numa_free(current, false);
 	return retval;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6ae4d7ae5a3b..c015f78966af 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -69,6 +69,7 @@ struct signal_struct;
 struct task_delay_info;
 struct task_group;
 struct io_uring_task;
+struct wgcm_task;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -1305,6 +1306,12 @@ struct task_struct {
 	unsigned long rseq_event_mask;
 #endif
 
+#ifdef CONFIG_WGCM
+	unsigned long			wgcm_flags;
+	struct wgcm_task		*wgcm_task;
+	struct wgcm_task		*wgcm_server_task;
+#endif
+
 	struct tlbflush_unmap_batch	tlb_ubc;
 
 	union {
@@ -1685,6 +1692,13 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
+
+#ifdef CONFIG_WGCM
+#define PF_WGCM_WORKER		0x01000000	/* WGCM worker */
+#else
+#define PF_WGCM_WORKER		0x00000000
+#endif
+
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000	/* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
@@ -2207,6 +2221,36 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+#ifdef CONFIG_WGCM
+extern int wgcm_ctl(unsigned long flags, unsigned long server_tid);
+extern void wgcm_do_exit(struct task_struct *tsk);
+extern void wgcm_upd_blk_workers_sum(struct task_struct *p, bool active);
+extern int wgcm_get_taskinfo(struct wgcm_task __user *self);
+extern void wgcm_clear_child(struct task_struct *p);
+#else
+static inline int wgcm_ctl(unsigned long flags, unsigned long server_tid)
+{
+	return -EINVAL;
+}
+
+static inline void wgcm_do_exit(struct task_struct *tsk)
+{
+}
+
+static inline void wgcm_upd_blk_workers_sum(struct task_struct *p, bool active)
+{
+}
+
+static inline int wgcm_get_taskinfo(struct wgcm_task __user *self)
+{
+	return -EINVAL;
+}
+
+static inline void wgcm_clear_child(struct task_struct *p)
+{
+}
+#endif
+
 const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
 char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
 int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dd5fff2bb1b2..fae73fcbf51d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -764,6 +764,23 @@ TRACE_EVENT(sched_isolate,
 );
 #endif
 
+TRACE_EVENT(tracing_mark_wgcm,
+	TP_PROTO(int pid, const char *name, bool trace_begin),
+	TP_ARGS(pid, name, trace_begin),
+	TP_STRUCT__entry(
+		__field(int, pid)
+		__string(trace_name, name)
+		__field(bool, trace_begin)
+	),
+	TP_fast_assign(
+		__entry->pid = pid;
+		__assign_str(trace_name, name);
+		__entry->trace_begin = trace_begin;
+	),
+	TP_printk("%s|%d|%s", __entry->trace_begin ? "B" : "E",
+		__entry->pid, __get_str(trace_name))
+);
+
 /*
  * Following tracepoints are not exported in tracefs and provide hooking
  * mechanisms only for testing and debugging purposes.
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index c190e220ff67..d5187a0ab628 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -247,6 +247,11 @@ struct prctl_mm_map {
 #define PR_SET_IO_FLUSHER		57
 #define PR_GET_IO_FLUSHER		58
 
+/* Register or unregister the current task as a WGCM task */
+#define PR_WGCM_CTL			59
+/* Copy the calling server's struct wgcm_task to userspace */
+#define PR_GET_WGCM_TASK		60
+
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
diff --git a/include/uapi/linux/wgcm.h b/include/uapi/linux/wgcm.h
new file mode 100644
index 000000000000..695966c8c09d
--- /dev/null
+++ b/include/uapi/linux/wgcm.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_WGCM_H
+#define _UAPI_LINUX_WGCM_H
+
+#include <linux/types.h>
+
+/*
+ * WGCM: Workgroup Control Monitor.
+ *
+ * use sys_prctl() (see kernel/sys.c) :
+ *	wgcm_ctl(): register/unregister WGCM tasks.
+ *
+ */
+
+#define UMCG_TASK_ALIGN			64
+
+#define UMCG_TID_MASK			0x3fffffffU
+
+/**
+ * struct wgcm_task: controls the state of WGCM tasks.
+ *
+ * The struct is aligned at 64 bytes to ensure that it fits into
+ * a single cache line.
+ */
+struct wgcm_task {
+	/**
+	 * @server_tid: server's tid.
+	 */
+	__u32	server_tid;		/* r w */
+
+	/**
+	 * @workers_sum: number of workers bound to this server
+	 *
+	 * Read-only for the userspace
+	 */
+	__u32	workers_sum;		/* r   */
+
+	/**
+	 * @blk_workers_sum: number of those workers that are blocked
+	 *
+	 * Read-only for the userspace
+	 */
+	__u32	blk_workers_sum;	/* r   */
+
+	__u32	__zero[1];
+
+} __attribute__((packed, aligned(UMCG_TASK_ALIGN)));
+
+/**
+ * enum wgcm_ctl_flag - flags to pass to wgcm_ctl()
+ * @WGCM_CTL_REGISTER:   register the current task as a WGCM task
+ * @WGCM_CTL_UNREGISTER: unregister the current task as a WGCM task
+ * @WGCM_CTL_WORKER:     register the current task as a WGCM worker
+ */
+enum wgcm_ctl_flag {
+	WGCM_CTL_REGISTER	= 0x0001,
+	WGCM_CTL_UNREGISTER	= 0x0002,
+	WGCM_CTL_WORKER		= 0x0100,
+};
+
+#endif /* _UAPI_LINUX_WGCM_H */
diff --git a/init/Kconfig b/init/Kconfig
index 2e5b9288081e..1ca74fe127a5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1699,6 +1699,14 @@ config KALLSYMS
 	  symbolic stack backtraces. This increases the size of the kernel
 	  somewhat, as all symbols have to be loaded into the kernel image.
 
+config WGCM
+	bool "Enable Workgroup Control Monitor API"
+	depends on 64BIT
+	default n
+	help
+	  Enable Workgroup Control Monitor API, userspace can get the number of
+	  blocked threads.
+
 config KALLSYMS_ALL
 	bool "Include all symbols in kallsyms"
 	depends on DEBUG_KERNEL && KALLSYMS
diff --git a/kernel/exit.c b/kernel/exit.c
index 795e16ecc422..e870f93113a5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -850,6 +850,8 @@ void __noreturn do_exit(long code)
 	if (tsk->task_frag.page)
 		put_page(tsk->task_frag.page);
 
+	wgcm_do_exit(tsk);
+
 	validate_creds_for_do_exit(tsk);
 
 	check_stack_usage();
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 1b4834073ae7..8290730d57c9 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_CPU_ISOLATION) += isolation.o
 obj-$(CONFIG_PSI) += psi.o
 obj-$(CONFIG_SCHED_RUNNING_AVG) += sched_avg.o
 obj-$(CONFIG_SCHED_CORE_CTRL) += core_ctl.o
+obj-$(CONFIG_WGCM) += wgcm.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 549334102718..e9b8d341a578 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1603,16 +1603,32 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	u32 old_state = p->on_rq;
+
 	enqueue_task(rq, p, flags);
 
 	p->on_rq = TASK_ON_RQ_QUEUED;
+
+	if (old_state == 0 && (p->flags & PF_WGCM_WORKER)) {
+		trace_tracing_mark_wgcm(current->tgid, "active_task", true);
+		wgcm_upd_blk_workers_sum(p, true);
+		trace_tracing_mark_wgcm(current->tgid, "", false);
+	}
 }
 
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	u32 old_state = p->on_rq;
+
 	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
 
 	dequeue_task(rq, p, flags);
+
+	if (old_state != 0 && p->on_rq == 0 && (p->flags & PF_WGCM_WORKER)) {
+		trace_tracing_mark_wgcm(current->tgid, "deactive_task", true);
+		wgcm_upd_blk_workers_sum(p, false);
+		trace_tracing_mark_wgcm(current->tgid, "", false);
+	}
 }
 
 static inline int __normal_prio(int policy, int rt_prio, int nice)
@@ -3213,6 +3229,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #ifdef CONFIG_SCHED_RTG
 	p->rtg_depth = 0;
 #endif
+
+	wgcm_clear_child(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/wgcm.c b/kernel/sched/wgcm.c
new file mode 100644
index 000000000000..2f54fc962acd
--- /dev/null
+++ b/kernel/sched/wgcm.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* WGCM: Workgroup Control Monitor */
+
+#include <linux/refcount.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/wgcm.h>
+
+#include "sched.h"
+
+/*
+ * Kernel-side state behind a struct wgcm_task.
+ *
+ * task_struct::wgcm_task and task_struct::wgcm_server_task point at @info.
+ * The live counters are kept here as atomics; wgcm_get_taskinfo() copies
+ * them into a plain snapshot for userspace.
+ *
+ * @refs holds one reference for the owning task plus one per bound worker,
+ * so a server's counters outlive the server for as long as a worker can
+ * still reach them from the scheduler hooks.
+ */
+struct wgcm_ktask {
+	struct wgcm_task	info;
+	atomic_t		workers_sum;
+	atomic_t		blk_workers_sum;
+	refcount_t		refs;
+	struct rcu_head		rcu;
+};
+
+static struct wgcm_ktask *to_ktask(struct wgcm_task *wt)
+{
+	return container_of(wt, struct wgcm_ktask, info);
+}
+
+/* Take a reference unless the last one is already gone. */
+static struct wgcm_task *wgcm_tryget(struct wgcm_task *wt)
+{
+	if (wt && refcount_inc_not_zero(&to_ktask(wt)->refs))
+		return wt;
+
+	return NULL;
+}
+
+/* Drop a reference; the final put frees after an RCU grace period. */
+static void wgcm_put(struct wgcm_task *wt)
+{
+	struct wgcm_ktask *kt;
+
+	if (!wt)
+		return;
+
+	kt = to_ktask(wt);
+	if (refcount_dec_and_test(&kt->refs))
+		kfree_rcu(kt, rcu);
+}
+
+/*
+ * Reset the WGCM state a forked task inherited by copying its parent's
+ * task_struct. The pointers belong to the parent, so nothing is freed.
+ */
+void wgcm_clear_child(struct task_struct *p)
+{
+	p->flags &= ~PF_WGCM_WORKER;
+	p->wgcm_flags = 0;
+	WRITE_ONCE(p->wgcm_task, NULL);
+	p->wgcm_server_task = NULL;
+}
+
+static void wgcm_upd_workers_sum(struct wgcm_task *server, bool regist)
+{
+	struct wgcm_ktask *kt = to_ktask(server);
+
+	if (regist)
+		atomic_inc(&kt->workers_sum);
+	else
+		atomic_dec(&kt->workers_sum);
+}
+
+/*
+ * Drop @tsk's registration, if any. Only ever called for current
+ * (unregister, exec, exit).
+ */
+static void wgcm_clear_task(struct task_struct *tsk)
+{
+	struct wgcm_task *server = tsk->wgcm_server_task;
+	struct wgcm_task *self = tsk->wgcm_task;
+
+	/* Stop the scheduler hooks before the server reference is dropped. */
+	tsk->flags &= ~PF_WGCM_WORKER;
+	tsk->wgcm_flags = 0;
+	WRITE_ONCE(tsk->wgcm_server_task, NULL);
+	/* Clear the pointer so a stale one can never be read or freed again. */
+	WRITE_ONCE(tsk->wgcm_task, NULL);
+
+	if (server) {
+		wgcm_upd_workers_sum(server, false);
+		wgcm_put(server);
+	}
+	wgcm_put(self);
+}
+
+/*
+ * Scheduler hook: worker @p blocked (!@active) or became runnable again
+ * (@active). Runs from activate_task()/deactivate_task(), hence the
+ * deferred printk.
+ */
+void wgcm_upd_blk_workers_sum(struct task_struct *p, bool active)
+{
+	struct wgcm_task *server = p->wgcm_server_task;
+	struct wgcm_ktask *kt;
+
+	if (!server) {
+		printk_deferred_once(KERN_ERR "[WGCM] worker is not bound to its server\n");
+		return;
+	}
+
+	kt = to_ktask(server);
+	if (active)
+		atomic_dec(&kt->blk_workers_sum);
+	else
+		atomic_inc(&kt->blk_workers_sum);
+}
+
+/* Drop the registration of a task that is exiting or has just exec'ed. */
+void wgcm_do_exit(struct task_struct *tsk)
+{
+	wgcm_clear_task(tsk);
+}
+
+/*
+ * Copy the calling server's tid and worker counters to @self.
+ *
+ * Return: 0 on success, -EINVAL if current is a worker or not registered,
+ * -EFAULT if @self cannot be written.
+ */
+int wgcm_get_taskinfo(struct wgcm_task __user *self)
+{
+	struct task_struct *tsk = current;
+	struct wgcm_task snap;
+	struct wgcm_ktask *kt;
+
+	if ((tsk->flags & PF_WGCM_WORKER) || !tsk->wgcm_task)
+		return -EINVAL;
+
+	kt = to_ktask(tsk->wgcm_task);
+	memset(&snap, 0, sizeof(snap));
+	snap.server_tid = kt->info.server_tid;
+	snap.workers_sum = atomic_read(&kt->workers_sum);
+	snap.blk_workers_sum = atomic_read(&kt->blk_workers_sum);
+
+	/* copy_to_user() returns the bytes left uncopied, not an errno. */
+	if (copy_to_user(self, &snap, sizeof(snap)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int wgcm_register(unsigned long flags, unsigned long server_tid)
+{
+	struct task_struct *tsk = current;
+	bool worker = flags == WGCM_CTL_WORKER;
+	struct wgcm_task *server_wt = NULL;
+	struct task_struct *server;
+	struct wgcm_ktask *kt;
+	bool found = false;
+
+	if (tsk->wgcm_task || server_tid == 0 || server_tid > INT_MAX)
+		return -EINVAL;
+
+	kt = kzalloc(sizeof(*kt), GFP_KERNEL);
+	if (!kt)
+		return -ENOMEM;
+
+	kt->info.server_tid = server_tid;
+	refcount_set(&kt->refs, 1);
+
+	rcu_read_lock();
+	server = find_task_by_vpid(server_tid);
+	if (server && server->mm == tsk->mm) {
+		if (!worker) {
+			/* A server registers itself, under its own tid. */
+			found = server == tsk;
+		} else if (!(server->flags & PF_WGCM_WORKER)) {
+			/* @server is only stable under RCU: pin its state now. */
+			server_wt = wgcm_tryget(READ_ONCE(server->wgcm_task));
+			found = server_wt != NULL;
+		}
+	}
+	rcu_read_unlock();
+
+	if (!found) {
+		kfree(kt);
+		return -ESRCH;
+	}
+
+	tsk->wgcm_flags = flags;
+	WRITE_ONCE(tsk->wgcm_task, &kt->info);
+	if (worker) {
+		WRITE_ONCE(tsk->wgcm_server_task, server_wt);
+		wgcm_upd_workers_sum(server_wt, true);
+		/* Set last: from here on the scheduler hooks account us. */
+		tsk->flags |= PF_WGCM_WORKER;
+	}
+
+	return 0;
+}
+
+static int wgcm_unregister(void)
+{
+	wgcm_clear_task(current);
+	return 0;
+}
+
+#define WGCM_CTL_CMD	0xff
+
+/**
+ * wgcm_ctl: (un)register the current task as a WGCM task.
+ * @flags: ORed values from enum wgcm_ctl_flag; see below;
+ * @server_tid: server's (monitor's) thread id.
+ *
+ * @flags & WGCM_CTL_REGISTER: register a WGCM task:
+ *
+ * @flags & WGCM_CTL_UNREGISTER: unregister a WGCM task.
+ *
+ * WGCM workers:
+ *  - @flags & WGCM_CTL_WORKER
+ *
+ * WGCM server:
+ *  - !(@flags & WGCM_CTL_WORKER)
+ *
+ * When registering:
+ *  - server_tid must be valid (e.g. not zero).
+ *
+ * If the conditions above are met, wgcm_ctl() immediately returns
+ * if the registered task is a server. If the registered task is a
+ * worker, its server's workers_sum is incremented. Conversely, if
+ * the unregistered task is a worker, its server's workers_sum is
+ * decremented.
+ *
+ * Return:
+ * 0		- success
+ * -EINVAL	- bad @flags or @server_tid, or already registered
+ * -ENOMEM	- out of memory
+ * -ESRCH	- no such server_tid
+ */
+int wgcm_ctl(unsigned long flags, unsigned long server_tid)
+{
+	unsigned long cmd = flags & WGCM_CTL_CMD;
+
+	flags &= ~WGCM_CTL_CMD;
+	if (flags & ~WGCM_CTL_WORKER)
+		return -EINVAL;
+
+	switch (cmd) {
+	case WGCM_CTL_REGISTER:
+		return wgcm_register(flags, server_tid);
+	case WGCM_CTL_UNREGISTER:
+		return wgcm_unregister();
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index c63de71889bf..1433fb59a265 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2588,6 +2588,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
 		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
 		break;
+	case PR_WGCM_CTL:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = wgcm_ctl(arg2, arg3);
+		break;
+	case PR_GET_WGCM_TASK:
+		if (!arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = wgcm_get_taskinfo((struct wgcm_task __user *)arg2);
+		break;
 	case PR_SET_VMA:
 		error = prctl_set_vma(arg2, arg3, arg4, arg5);
 		break;
-- 
Gitee