diff --git a/fs/exec.c b/fs/exec.c
index 72f8763b3ce9cd154b274b6afdd9be624213cec5..131bfbb88698cc9339a1e0fd5669d497bdeeb698 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1831,6 +1831,7 @@ static int bprm_execve(struct linux_binprm *bprm,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	rseq_execve(current);
+	wgcm_clear_child(current);
 	acct_update_integrals(current);
 	task_numa_free(current, false);
 	return retval;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6ae4d7ae5a3bfa6f848e05475c5343b1b0444001..c015f78966af556ee451281970b11ea3616ec4f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -69,6 +69,7 @@ struct signal_struct;
 struct task_delay_info;
 struct task_group;
 struct io_uring_task;
+struct wgcm_task;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -1305,6 +1306,12 @@ struct task_struct {
 	unsigned long rseq_event_mask;
 #endif
 
+#ifdef CONFIG_WGCM
+	unsigned long			wgcm_flags;
+	struct wgcm_task		*wgcm_task;
+	struct wgcm_task		*wgcm_server_task;
+#endif
+
 	struct tlbflush_unmap_batch	tlb_ubc;
 
 	union {
@@ -1685,6 +1692,13 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
+
+#ifdef CONFIG_WGCM
+#define PF_WGCM_WORKER		0x01000000	/* WGCM worker */
+#else
+#define PF_WGCM_WORKER		0x00000000
+#endif
+
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000	/* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
@@ -2207,6 +2221,41 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+#ifdef CONFIG_WGCM
+/* Workgroup Control Monitor hooks, see kernel/sched/wgcm.c. */
+extern int wgcm_ctl(unsigned long flags, unsigned long server_tid);
+extern void wgcm_do_exit(struct task_struct *tsk);
+extern void wgcm_upd_blk_workers_sum(struct task_struct *p, bool active);
+extern int wgcm_get_taskinfo(struct wgcm_task __user *self);
+extern void wgcm_clear_child(struct task_struct *p);
+#else
+/*
+ * Without CONFIG_WGCM the prctl() entry points must fail instead of
+ * reporting success for an operation that never happened.
+ */
+static inline int wgcm_ctl(unsigned long flags, unsigned long server_tid)
+{
+	return -EINVAL;
+}
+
+static inline void wgcm_do_exit(struct task_struct *tsk)
+{
+}
+
+static inline void wgcm_upd_blk_workers_sum(struct task_struct *p, bool active)
+{
+}
+
+static inline int wgcm_get_taskinfo(struct wgcm_task __user *self)
+{
+	return -EINVAL;
+}
+
+static inline void wgcm_clear_child(struct task_struct *p)
+{
+}
+#endif
+
 const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
 char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
 int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dd5fff2bb1b21f58919ac9fd34721295d1a02024..fae73fcbf51d31c09348addacab47f3544188463 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -764,6 +764,23 @@ TRACE_EVENT(sched_isolate,
 );
 #endif
 
+TRACE_EVENT(tracing_mark_wgcm,
+	TP_PROTO(int pid, const char *name, bool trace_begin),
+	TP_ARGS(pid, name, trace_begin),
+	TP_STRUCT__entry(
+		__field(int, pid)
+		__string(trace_name, name)
+		__field(bool, trace_begin)
+	),
+	TP_fast_assign(
+		__entry->pid = pid;
+		__assign_str(trace_name, name);
+		__entry->trace_begin = trace_begin;
+	),
+	TP_printk("%s|%d|%s", __entry->trace_begin ? "B" : "E",
+		__entry->pid, __get_str(trace_name))
+);
+
 /*
  * Following tracepoints are not exported in tracefs and provide hooking
  * mechanisms only for testing and debugging purposes.
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index c190e220ff6761aa55eebb2f51d41b62a6f99fb7..d5187a0ab6280b75463644e2af1da033928a9a63 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -247,6 +247,11 @@ struct prctl_mm_map {
 #define PR_SET_IO_FLUSHER		57
 #define PR_GET_IO_FLUSHER		58
 
+/* Register or unregister the current task as a WGCM task */
+#define PR_WGCM_CTL			59
+/* Copy the current task's struct wgcm_task to userspace */
+#define PR_GET_WGCM_TASK		60
+
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
diff --git a/include/uapi/linux/wgcm.h b/include/uapi/linux/wgcm.h
new file mode 100644
index 0000000000000000000000000000000000000000..695966c8c09dd68f44cd9229adc013cfce898ea7
--- /dev/null
+++ b/include/uapi/linux/wgcm.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_WGCM_H
+#define _UAPI_LINUX_WGCM_H
+
+#include <linux/types.h>
+
+/*
+ * WGCM: Workgroup Control Monitor.
+ *
+ * use sys_prctl() (see kernel/sys.c) :
+ * wgcm_ctl(): register/unregister WGCM tasks.
+ *
+ */
+
+#define UMCG_TASK_ALIGN			64
+
+#define UMCG_TID_MASK			0x3fffffffU
+
+/**
+ * struct wgcm_task: controls the state of WGCM tasks.
+ *
+ * The struct is aligned at 64 bytes to ensure that it fits into
+ * a single cache line.
+ *
+ * NOTE(review): atomic_t is a kernel-internal type, so userspace that
+ * includes this header presumably needs its own definition - confirm
+ * the intended userspace view of the two counters.
+ */
+struct wgcm_task {
+	/**
+	 * @server_tid: server's tid.
+	 */
+	__u32		server_tid;		/* r w */
+
+	/**
+	 * @workers_sum: count the number of workers which is bound with server
+	 *
+	 * Read-only for the userspace
+	 */
+	atomic_t	workers_sum;		/* r */
+
+	/**
+	 * @blk_workers_sum: count the number of block workers
+	 *
+	 * Read-only for the userspace
+	 */
+	atomic_t	blk_workers_sum;	/* r */
+
+	__u32		__zero[1];
+
+} __attribute__((packed, aligned(UMCG_TASK_ALIGN)));
+
+/**
+ * enum wgcm_ctl_flag - flags to pass to wgcm_ctl()
+ * @WGCM_CTL_REGISTER:   register the current task as a WGCM task
+ * @WGCM_CTL_UNREGISTER: unregister the current task as a WGCM task
+ * @WGCM_CTL_WORKER:     register the current task as a WGCM worker
+ */
+enum wgcm_ctl_flag {
+	WGCM_CTL_REGISTER	= 0x0001,
+	WGCM_CTL_UNREGISTER	= 0x0002,
+	WGCM_CTL_WORKER		= 0x0100,
+};
+
+#endif /* _UAPI_LINUX_WGCM_H */
diff --git a/init/Kconfig b/init/Kconfig
index 2e5b9288081ed080de8b63358bb840775e98a528..1ca74fe127a51834a8444ea01aada50ee69a040c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1699,6 +1699,14 @@ config KALLSYMS
 	  symbolic stack backtraces. This increases the size of the kernel
 	  somewhat, as all symbols have to be loaded into the kernel image.
 
+config WGCM
+	bool "Enable Workgroup Control Monitor API"
+	depends on 64BIT
+	default n
+	help
+	  Enable Workgroup Control Monitor API, userspace can get the number of
+	  blocked threads.
+
 config KALLSYMS_ALL
 	bool "Include all symbols in kallsyms"
 	depends on DEBUG_KERNEL && KALLSYMS
diff --git a/kernel/exit.c b/kernel/exit.c
index 795e16ecc422a09980726dc1c2f179b6023b4746..e870f93113a5c1cf7f5b26ed565867d2e10eaf8d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -850,6 +850,8 @@ void __noreturn do_exit(long code)
 	if (tsk->task_frag.page)
 		put_page(tsk->task_frag.page);
 
+	wgcm_do_exit(tsk);
+
 	validate_creds_for_do_exit(tsk);
 
 	check_stack_usage();
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 1b4834073ae75a670e03dca916ada9c58bab95bf..8290730d57c94177d19001887d31a749fa11843a 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_CPU_ISOLATION) += isolation.o
 obj-$(CONFIG_PSI) += psi.o
 obj-$(CONFIG_SCHED_RUNNING_AVG) += sched_avg.o
 obj-$(CONFIG_SCHED_CORE_CTRL) += core_ctl.o
+obj-$(CONFIG_WGCM) += wgcm.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 549334102718b146e07addc691d75a6346c4b369..e9b8d341a57882a7ab1da1cb059c5f5411d4269a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1603,16 +1603,34 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	u32 old_state = p->on_rq;
+
 	enqueue_task(rq, p, flags);
 
 	p->on_rq = TASK_ON_RQ_QUEUED;
+
+	/* A WGCM worker that was off the runqueue is runnable again. */
+	if (old_state == 0 && (p->flags & PF_WGCM_WORKER)) {
+		trace_tracing_mark_wgcm(current->tgid, "active_task", true);
+		wgcm_upd_blk_workers_sum(p, true);
+		trace_tracing_mark_wgcm(current->tgid, "", false);
+	}
 }
 
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	u32 old_state = p->on_rq;
+
 	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
 
 	dequeue_task(rq, p, flags);
+
+	/* Only a DEQUEUE_SLEEP dequeue counts the worker as blocked. */
+	if (old_state != 0 && p->on_rq == 0 && (p->flags & PF_WGCM_WORKER)) {
+		trace_tracing_mark_wgcm(current->tgid, "deactive_task", true);
+		wgcm_upd_blk_workers_sum(p, false);
+		trace_tracing_mark_wgcm(current->tgid, "", false);
+	}
 }
 
 static inline int __normal_prio(int policy, int rt_prio, int nice)
@@ -3213,6 +3231,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #ifdef CONFIG_SCHED_RTG
 	p->rtg_depth = 0;
 #endif
+
+	wgcm_clear_child(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/wgcm.c b/kernel/sched/wgcm.c
new file mode 100644
index 0000000000000000000000000000000000000000..2f54fc962acd91487b8bf2edac55f9afa3a5ca97
--- /dev/null
+++ b/kernel/sched/wgcm.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* WGCM: Workgroup Control Monitor */
+
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/wgcm.h>
+
+#include "sched.h"
+
+/**
+ * wgcm_clear_child - forget inherited WGCM state without freeing it
+ * @p: task whose WGCM fields are reset
+ *
+ * Used from __sched_fork() and bprm_execve(). The wgcm_task pointer is
+ * only dropped, never freed.
+ *
+ * NOTE(review): on the exec path this looks like it leaks current's own
+ * wgcm_task and leaves the server's workers_sum unchanged - confirm.
+ */
+void wgcm_clear_child(struct task_struct *p)
+{
+	if (p->wgcm_task) {
+		WRITE_ONCE(p->wgcm_task, NULL);
+		p->flags &= ~PF_WGCM_WORKER;
+	}
+	p->wgcm_flags = 0;
+	p->wgcm_server_task = NULL;
+}
+
+/* Free @tsk's own wgcm_task and reset all of its WGCM state. */
+static void wgcm_clear_task(struct task_struct *tsk)
+{
+	/* Callers pass the current task, so no strict atomicity is needed. */
+	if (tsk->wgcm_task) {
+		kfree(tsk->wgcm_task);
+		/* Never leave a dangling pointer for register/get_taskinfo. */
+		tsk->wgcm_task = NULL;
+		tsk->flags &= ~PF_WGCM_WORKER;
+	}
+
+	tsk->wgcm_flags = 0;
+	tsk->wgcm_server_task = NULL;
+}
+
+/**
+ * wgcm_upd_blk_workers_sum - account a worker blocking or waking up
+ * @p: WGCM worker being (de)activated
+ * @active: true when @p is activated, false when it is deactivated
+ *
+ * Called from activate_task()/deactivate_task(), i.e. from scheduler
+ * context, so the diagnostic is emitted through printk_deferred().
+ */
+void wgcm_upd_blk_workers_sum(struct task_struct *p, bool active)
+{
+	struct wgcm_task *server = p->wgcm_server_task;
+
+	if (!server) {
+		printk_deferred_once(KERN_ERR
+			"[WGCM]The WGCM worker is not bound to its server\n");
+		return;
+	}
+
+	if (active)
+		atomic_dec(&server->blk_workers_sum);
+	else
+		atomic_inc(&server->blk_workers_sum);
+}
+
+/* Adjust @server's count of bound workers on (un)registration. */
+static void wgcm_upd_workers_sum(struct wgcm_task *server, bool regist)
+{
+	if (regist)
+		atomic_inc(&server->workers_sum);
+	else
+		atomic_dec(&server->workers_sum);
+}
+
+/**
+ * wgcm_do_exit - tear down the WGCM state of an exiting task
+ * @tsk: the exiting task
+ *
+ * A bound worker leaves its server's workers_sum and is fully cleared, so
+ * that the final deactivation of the dying task is not counted in
+ * blk_workers_sum.
+ *
+ * NOTE(review): a task without wgcm_server_task (a server) returns early
+ * and keeps its wgcm_task allocated; bound workers may still point at it,
+ * so its lifetime needs an explicit owner - confirm.
+ */
+void wgcm_do_exit(struct task_struct *tsk)
+{
+	struct wgcm_task *server = tsk->wgcm_server_task;
+
+	if (!server)
+		return;
+
+	if (tsk->flags & PF_WGCM_WORKER)
+		wgcm_upd_workers_sum(server, false);
+
+	wgcm_clear_task(tsk);
+}
+
+/**
+ * wgcm_get_taskinfo - copy the calling server's wgcm_task to userspace
+ * @self: user buffer receiving a struct wgcm_task
+ *
+ * Return: 0 on success, -EINVAL if current is a worker or not registered,
+ * -EFAULT if @self cannot be written.
+ */
+int wgcm_get_taskinfo(struct wgcm_task __user *self)
+{
+	struct task_struct *tsk = current;
+
+	if ((tsk->flags & PF_WGCM_WORKER) || !tsk->wgcm_task)
+		return -EINVAL;
+
+	/* copy_to_user() returns the number of bytes left, not an errno. */
+	if (copy_to_user(self, tsk->wgcm_task, sizeof(*self)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * wgcm_register - register current as a WGCM server or worker
+ * @flags: 0 for a server, WGCM_CTL_WORKER for a worker
+ * @server_tid: tid of the server; a server passes its own tid
+ *
+ * NOTE(review): the server's wgcm_task is not reference counted, so a
+ * server that unregisters leaves bound workers with a stale pointer -
+ * confirm the intended lifetime rules.
+ *
+ * Return: 0, -EINVAL, -ENOMEM or -ESRCH.
+ */
+static int wgcm_register(unsigned long flags, unsigned long server_tid)
+{
+	bool worker = flags == WGCM_CTL_WORKER;
+	struct task_struct *tsk = current;
+	struct wgcm_task *server_wt = NULL;
+	struct task_struct *server;
+	struct wgcm_task *wt;
+	bool found = false;
+
+	/* find_task_by_vpid() takes a pid_t: refuse tids that would wrap. */
+	if (tsk->wgcm_task || server_tid == 0 || server_tid > INT_MAX) {
+		pr_debug("[WGCM][PID:%d]server_tid = %lu\n", tsk->pid, server_tid);
+		return -EINVAL;
+	}
+
+	wt = kzalloc(sizeof(*wt), GFP_KERNEL);
+	if (!wt)
+		return -ENOMEM;
+
+	wt->server_tid = server_tid;
+
+	/*
+	 * The server task is only guaranteed to exist inside the RCU
+	 * section, so fetch everything needed from it before unlocking.
+	 */
+	rcu_read_lock();
+	server = find_task_by_vpid(server_tid);
+	if (server && server->mm == tsk->mm) {
+		if (!worker) {
+			/* A server has to register under its own tid. */
+			found = server == tsk;
+		} else if (!(server->flags & PF_WGCM_WORKER)) {
+			/* A worker has to name a registered server. */
+			server_wt = READ_ONCE(server->wgcm_task);
+			found = server_wt != NULL;
+		}
+	}
+	rcu_read_unlock();
+
+	if (!found) {
+		kfree(wt);
+		return -ESRCH;
+	}
+
+	tsk->wgcm_flags = flags;
+	WRITE_ONCE(tsk->wgcm_task, wt);
+	if (worker) {
+		WRITE_ONCE(tsk->wgcm_server_task, server_wt);
+		wgcm_upd_workers_sum(server_wt, true);
+		tsk->flags |= PF_WGCM_WORKER;	/* hook schedule() */
+	}
+
+	return 0;
+}
+
+/* Unregister current, leaving its server's workers_sum if it is bound. */
+static int wgcm_unregister(void)
+{
+	if (current->wgcm_server_task)
+		wgcm_upd_workers_sum(current->wgcm_server_task, false);
+
+	wgcm_clear_task(current);
+	return 0;
+}
+
+#define WGCM_CTL_CMD	0xff
+
+/**
+ * wgcm_ctl: (un)register the current task as a WGCM task.
+ * @flags: ORed values from enum wgcm_ctl_flag; see below;
+ * @server_tid: server's(monitor's) thread id.
+ *
+ * @flags & WGCM_CTL_REGISTER: register a WGCM task:
+ *
+ * @flags & WGCM_CTL_UNREGISTER: unregister a WGCM task.
+ *
+ * WGCM workers:
+ *  - @flags & WGCM_CTL_WORKER
+ *
+ * WGCM server:
+ *  - !(@flags & WGCM_CTL_WORKER)
+ *
+ * All tasks:
+ *  - server_tid must be valid(e.g. not zero).
+ *
+ * If the conditions above are met, wgcm_ctl() immediately returns
+ * if the registered task is a server. If the registered task is a
+ * worker, its server's workers_sum will be increased. Conversely, if
+ * the unregistered task is a worker, its server's workers_sum will
+ * be decreased.
+ *
+ * Return:
+ * 0		- success
+ * -EINVAL	- bad @flags or @server_tid, or already registered
+ * -ENOMEM	- out of memory
+ * -ESRCH	- no such server_tid
+ */
+int wgcm_ctl(unsigned long flags, unsigned long server_tid)
+{
+	int cmd = flags & WGCM_CTL_CMD;
+
+	flags &= ~WGCM_CTL_CMD;
+
+	pr_debug("[WGCM][PID:%d] wgcm_ctl: cmd = %d, flags = %lu\n",
+		 current->pid, cmd, flags);
+
+	if (flags & ~WGCM_CTL_WORKER)
+		return -EINVAL;
+
+	switch (cmd) {
+	case WGCM_CTL_REGISTER:
+		return wgcm_register(flags, server_tid);
+
+	case WGCM_CTL_UNREGISTER:
+		return wgcm_unregister();
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index c63de71889bfa77e40509fa85680cf9f8ef405d5..1433fb59a26501a5b4f9c8d8352eb3981de4792c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2588,6 +2588,18 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
 		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
 		break;
+	case PR_WGCM_CTL:
+		pr_debug("[PID:%d]arg2=%lu, arg3=%lu, arg4=%lu, arg5=%lu\n",
+			 current->pid, arg2, arg3, arg4, arg5);
+		if (arg4 || arg5)
+			return -EINVAL;
+
+		return wgcm_ctl(arg2, arg3);
+	case PR_GET_WGCM_TASK:
+		if (!arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+
+		return wgcm_get_taskinfo((struct wgcm_task __user *)arg2);
 	case PR_SET_VMA:
 		error = prctl_set_vma(arg2, arg3, arg4, arg5);
 		break;