diff --git a/include/linux/sched.h b/include/linux/sched.h
index d611b40fdef61db72ed55f5da05608e30166a0a5..d3198acc29af318b39ef85a111161ab0c99238b0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -575,6 +575,10 @@ struct sched_entity {
 	unsigned long			runnable_weight;
 #endif
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+	int				latency_weight;
+#endif
+
 #ifdef CONFIG_SMP
 	/*
 	 * Per entity load average tracking.
@@ -790,6 +794,9 @@ struct task_struct {
 	int				static_prio;
 	int				normal_prio;
 	unsigned int			rt_priority;
+#ifdef CONFIG_SCHED_LATENCY_NICE
+	int				latency_prio;
+#endif
 
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
@@ -1745,6 +1752,7 @@ extern struct pid *cad_pid;
 #define PF_USER_WORKER		0x00004000	/* Kernel thread cloned from userspace thread */
 #define PF_NOFREEZE		0x00008000	/* This thread should not be frozen */
 #define PF__HOLE__00010000	0x00010000
+#define PF_FROZEN		PF__HOLE__00010000	/* Frozen for system suspend */
 #define PF_KSWAPD		0x00020000	/* I am kswapd */
 #define PF_MEMALLOC_NOFS	0x00040000	/* All allocation requests will inherit GFP_NOFS */
 #define PF_MEMALLOC_NOIO	0x00080000	/* All allocation requests will inherit GFP_NOIO */
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index ab83d85e1183aa962e797e15e2d08fce6b976f84..7d64feafc408e10902391acb61e6e59320869759 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -11,9 +11,16 @@
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
  * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
  * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space. This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
  */
 
-#define MAX_RT_PRIO		100
+#define MAX_USER_RT_PRIO	100
+#define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
 #define MAX_PRIO		(MAX_RT_PRIO + NICE_WIDTH)
 #define DEFAULT_PRIO		(MAX_RT_PRIO + NICE_WIDTH / 2)
@@ -26,6 +33,15 @@
 #define NICE_TO_PRIO(nice)	((nice) + DEFAULT_PRIO)
 #define PRIO_TO_NICE(prio)	((prio) - DEFAULT_PRIO)
 
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+
 /*
  * Convert nice value [19,-20] to rlimit style value [1,40].
  */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 5a64582b086b2864c454642df8257ecde39d394a..980adc56cfe7553c314881cdc7a69dabb413ac69 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -12,6 +12,9 @@ extern unsigned long sysctl_hung_task_timeout_secs;
 enum { sysctl_hung_task_timeout_secs = 0 };
 #endif
 
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_wakeup_granularity;
+
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab26ee78c37b425a76115aa21c20c06..b2e932c25be623d7b3d56998423c9e05964f81d2 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
 #define SCHED_FLAG_KEEP_PARAMS		0x10
 #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
 #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
+#define SCHED_FLAG_LATENCY_NICE		0x80
 
 #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
 				 SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
 			 SCHED_FLAG_RECLAIM		| \
 			 SCHED_FLAG_DL_OVERRUN		| \
 			 SCHED_FLAG_KEEP_ALL		| \
-			 SCHED_FLAG_UTIL_CLAMP)
+			 SCHED_FLAG_UTIL_CLAMP		| \
+			 SCHED_FLAG_LATENCY_NICE)
 
 #endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbfe0cee2a23abd796dad67c68f8002..db1e8199e8c8028f9c78d0b147f009dc647e3634 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -10,6 +10,7 @@ struct sched_param {
 
 #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
 #define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */
+#define SCHED_ATTR_SIZE_VER2	60	/* add: latency_nice */
 
 /*
  * Extended scheduling parameters data structure.
@@ -98,6 +99,22 @@ struct sched_param {
  * scheduled on a CPU with no more capacity than the specified value.
  *
  * A task utilization boundary can be reset by setting the attribute to -1.
+ *
+ * Latency Tolerance Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows specifying the relative latency
+ * requirements of a task with respect to the other tasks running/queued in the
+ * system.
+ *
+ * @ sched_latency_nice	task's latency_nice value
+ *
+ * The latency_nice of a task can have any value in the range
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
+ *
+ * A task whose latency_nice is MIN_LATENCY_NICE is assumed to require
+ * lower wakeup latency than tasks with a higher latency_nice value,
+ * which tolerate more scheduling delay.
  */
 struct sched_attr {
 	__u32 size;
@@ -120,6 +137,8 @@ struct sched_attr {
 	__u32 sched_util_min;
 	__u32 sched_util_max;
 
+	/* latency requirement hints */
+	__s32 sched_latency_nice;
 };
 
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/init/Kconfig b/init/Kconfig
index 18fece8fe0857cfb5119eea34a137238357c0efd..879f6dc1cb274a9f92a03601106cb1d1e1d35765 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -849,6 +849,13 @@ config UCLAMP_BUCKETS_COUNT
 
 	  If in doubt, use the default value.
 
+config SCHED_LATENCY_NICE
+	bool "Enable latency feature for FAIR tasks"
+	default n
+	help
+	  This feature uses the latency nice priority to decide whether a CFS
+	  task can preempt the currently running task.
+
 endmenu
 
 #
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b1c01c0d99d605d5f0c22693470d3..fff9eca032623048e07cc766d4fa4e95bbaccf3f 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,6 +78,9 @@ struct task_struct init_task
 	.prio		= MAX_PRIO - 20,
 	.static_prio	= MAX_PRIO - 20,
 	.normal_prio	= MAX_PRIO - 20,
+#ifdef CONFIG_SCHED_LATENCY_NICE
+	.latency_prio	= NICE_WIDTH - 20,
+#endif
 	.policy		= SCHED_NORMAL,
 	.cpus_ptr	= &init_task.cpus_mask,
 	.user_cpus_ptr	= NULL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a854b71836dd5b8a811d098a0cc24a2381e1e727..461c49c8db244abb42f7f7b7bd11b423a8cd0087 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1328,6 +1328,49 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 	}
 }
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+static void set_latency_weight(struct task_struct *p)
+{
+	p->se.latency_weight = sched_latency_to_weight[p->latency_prio];
+}
+
+static void __setscheduler_latency(struct task_struct *p,
+				   const struct sched_attr *attr)
+{
+	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+		p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice);
+		set_latency_weight(p);
+	}
+}
+
+static int latency_nice_validate(struct task_struct *p, bool user,
+				 const struct sched_attr *attr)
+{
+	if (attr->sched_latency_nice > MAX_LATENCY_NICE)
+		return -EINVAL;
+	if (attr->sched_latency_nice < MIN_LATENCY_NICE)
+		return -EINVAL;
+	/* Use the same security checks as NICE */
+	if (user && attr->sched_latency_nice < LATENCY_TO_NICE(p->latency_prio)
+	    && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	return 0;
+}
+#else
+static void
+__setscheduler_latency(struct task_struct *p, const struct sched_attr *attr)
+{
+}
+
+static inline
+int latency_nice_validate(struct task_struct *p, bool user,
+			  const struct sched_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 #ifdef CONFIG_UCLAMP_TASK
 /*
  * Serializes updates of utilization clamp values
@@ -4744,6 +4787,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 */
 	p->prio = current->normal_prio;
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+	/* Propagate the parent's latency requirements to the child as well */
+	p->latency_prio = current->latency_prio;
+#endif
+
 	uclamp_fork(p);
 
 	/*
@@ -4760,6 +4808,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 		p->prio = p->normal_prio = p->static_prio;
 		set_load_weight(p, false);
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+		p->latency_prio = NICE_TO_LATENCY(0);
+		set_latency_weight(p);
+#endif
+
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
 		 * fulfilled its duty:
@@ -7720,6 +7773,11 @@ static int __sched_setscheduler(struct task_struct *p,
 			goto change;
 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
 			goto change;
+#ifdef CONFIG_SCHED_LATENCY_NICE
+		if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
+		    attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))
+			goto change;
+#endif
 
 		p->sched_reset_on_fork = reset_on_fork;
 		retval = 0;
@@ -8019,6 +8077,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
 	    size < SCHED_ATTR_SIZE_VER1)
 		return -EINVAL;
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+	if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
+	    size < SCHED_ATTR_SIZE_VER2)
+		return -EINVAL;
+#endif
 	/*
 	 * XXX: Do we want to be lenient like existing syscalls; or do we want
 	 * to be strict and return an error on out-of-bounds values?
@@ -8256,6 +8319,10 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	get_params(p, &kattr);
 	kattr.sched_flags &= SCHED_FLAG_ALL;
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+	kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);
+#endif
+
 #ifdef CONFIG_UCLAMP_TASK
 	/*
 	 * This could race with another potential updater, but this is fine
@@ -11571,6 +11638,22 @@ const u32 sched_prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+/*
+ * Latency weight for wakeup preemption
+ */
+const int sched_latency_to_weight[40] = {
+ /* -20 */      1024,       973,       922,       870,       819,
+ /* -15 */       768,       717,       666,       614,       563,
+ /* -10 */       512,       461,       410,       358,       307,
+ /*  -5 */       256,       205,       154,       102,        51,
+ /*   0 */         0,       -51,      -102,      -154,      -205,
+ /*   5 */      -256,      -307,      -358,      -410,      -461,
+ /*  10 */      -512,      -563,      -614,      -666,      -717,
+ /*  15 */      -768,      -819,      -870,      -922,      -973,
+};
+#endif
+
 void call_trace_sched_update_nr_running(struct rq *rq, int count)
 {
 	trace_sched_update_nr_running_tp(rq, count);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db6326703f92aab707771c39921a543..83932b92dbb392d4e211b6af24292e84f3e1aa89 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,6 +6,7 @@
  *
  * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
  */
+#include "sched.h"
 
 /*
  * This allows printing both to /proc/sched_debug and
@@ -1086,6 +1087,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 #endif
 	P(policy);
 	P(prio);
+#ifdef CONFIG_SCHED_LATENCY_NICE
+	P(latency_prio);
+#endif
 	if (task_has_dl_policy(p)) {
 		P(dl.runtime);
 		P(dl.deadline);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d336af9cba1341ba14a95d956045ad70c0ae3883..7c146d6ba72089721a1396d2e6f44ce872483b1f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -57,6 +57,22 @@
 #include "stats.h"
 #include "autogroup.h"
 
+/*
+ * Targeted preemption latency for CPU-bound tasks:
+ *
+ * NOTE: this latency value is not the same as the concept of
+ * 'timeslice length' - timeslices in CFS are of variable length
+ * and have no persistent notion like in traditional, time-slice
+ * based scheduling concepts.
+ *
+ * (to see the precise effective timeslice length of your workload,
+ *  run vmstat and monitor the context-switches (cs) field)
+ *
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
+ */
+unsigned int sysctl_sched_latency = 6000000ULL;
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -78,12 +94,29 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 unsigned int sysctl_sched_base_slice			= 750000ULL;
 static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;
 
+/*
+ * This value is kept at sysctl_sched_latency/sysctl_sched_base_slice
+ */
+static unsigned int sched_nr_latency = 8;
+
 /*
  * After fork, child runs first. If set to 0 (default) then
  * parent will (try to) run first.
  */
 unsigned int sysctl_sched_child_runs_first __read_mostly;
 
+/*
+ * SCHED_OTHER wake-up granularity.
+ *
+ * This option delays the preemption effects of decoupled workloads
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ */
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
 int sched_thermal_decay_shift;
@@ -238,6 +271,8 @@ static void update_sysctl(void)
 #define SET_SYSCTL(name) \
 	(sysctl_##name = (factor) * normalized_sysctl_##name)
 	SET_SYSCTL(sched_base_slice);
+	SET_SYSCTL(sched_latency);
+	SET_SYSCTL(sched_wakeup_granularity);
 #undef SET_SYSCTL
 }
 
@@ -1003,6 +1038,8 @@ int sched_update_scaling(void)
 #define WRT_SYSCTL(name) \
 	(normalized_sysctl_##name = sysctl_##name / (factor))
 	WRT_SYSCTL(sched_base_slice);
+	WRT_SYSCTL(sched_latency);
+	WRT_SYSCTL(sched_wakeup_granularity);
 #undef WRT_SYSCTL
 
 	return 0;
@@ -5348,6 +5385,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
 /*
  * Pick the next process, keeping these things in mind, in this order:
  * 1) keep things fair between processes/task groups
@@ -6590,6 +6630,37 @@ static int sched_idle_cpu(int cpu)
 }
 #endif
 
+static void set_next_buddy(struct sched_entity *se);
+
+#ifdef CONFIG_SCHED_LATENCY_NICE
+static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se)
+{
+	struct sched_entity *next;
+
+	if (se->latency_weight <= 0)
+		return;
+
+	if (cfs->nr_running <= 1)
+		return;
+	/*
+	 * A task that wakes up while the CPU was idle does not go through
+	 * the wakeup-preemption check against the idle thread, and no next
+	 * buddy is set for it at that point.
+	 * On simultaneous wakeups from idle, latency-sensitive tasks would
+	 * therefore lose the opportunity to preempt the non-sensitive tasks
+	 * that woke up at the same time, so set the next buddy here instead.
+	 */
+
+	if (cfs->next)
+		next = cfs->next;
+	else
+		next = __pick_first_entity(cfs);
+
+	if (next && wakeup_preempt_entity(next, se) == 1)
+		set_next_buddy(se);
+}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -6676,6 +6747,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!task_new)
 		update_overutilized_status(rq);
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+	if (rq->curr == rq->idle)
+		check_preempt_from_idle(cfs_rq_of(&p->se), &p->se);
+#endif
+
 enqueue_throttle:
 	assert_list_leaf_cfs_rq(rq);
 
@@ -8126,6 +8202,93 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 }
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se)
+{
+	int latency_weight = se->latency_weight;
+	long thresh = sysctl_sched_latency;
+
+	/*
+	 * A positive latency weight means that the sched_entity has a latency
+	 * requirement that needs to be evaluated against the current entity.
+	 * Otherwise, the latency weight is used to evaluate how much
+	 * scheduling delay is acceptable by se.
+	 */
+	if ((se->latency_weight > 0) || (curr->latency_weight > 0))
+		latency_weight -= curr->latency_weight;
+
+	if (!latency_weight)
+		return 0;
+
+	if (sched_feat(GENTLE_FAIR_SLEEPERS))
+		thresh >>= 1;
+
+	/*
+	 * Clamp the delta to stay in the scheduler period range
+	 * [-sysctl_sched_latency:sysctl_sched_latency]
+	 */
+	latency_weight = clamp_t(long, latency_weight,
+				 -1 * NICE_LATENCY_WEIGHT_MAX,
+				 NICE_LATENCY_WEIGHT_MAX);
+
+	return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
+}
+#endif
+
+static unsigned long wakeup_gran(struct sched_entity *se)
+{
+	unsigned long gran = sysctl_sched_wakeup_granularity;
+
+	/*
+	 * Since it's curr that is running now, convert the gran from
+	 * real-time to virtual-time in its units.
+	 *
+	 * By using 'se' instead of 'curr' we penalize light tasks, so
+	 * they get preempted easier. That is, if 'se' < 'curr' then
+	 * the resulting gran will be larger, therefore penalizing the
+	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
+	 * be smaller, again penalizing the lighter task.
+	 *
+	 * This is especially important for buddies when the leftmost
+	 * task is higher priority than the buddy.
+	 */
+	return calc_delta_fair(gran, se);
+}
+
+/*
+ * Should 'se' preempt 'curr'.
+ *
+ *             |s1
+ *        |s2
+ *   |s3
+ *         g
+ *      |<--->|c
+ *
+ *  w(c, s1) = -1
+ *  w(c, s2) =  0
+ *  w(c, s3) =  1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+	s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+#ifdef CONFIG_SCHED_LATENCY_NICE
+	/* Take into account latency priority */
+	vdiff += wakeup_latency_gran(curr, se);
+#endif
+
+	if (vdiff <= 0)
+		return -1;
+
+	gran = wakeup_gran(se);
+	if (vdiff > gran)
+		return 1;
+
+	return 0;
+}
+
 static void set_next_buddy(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index f770168230ae4a09dd0f240957c0c7d749001a50..68a41a8013f12e645b2e3a33029d89c4dfd7bdc7 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,5 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+/*
+ * Only give sleepers 50% of their service deficit. This allows
+ * them to run sooner, but does not allow tons of sleepers to
+ * rip the spread apart.
+ */
+SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
+
 /*
  * Using the avg_vruntime, do the right thing and preserve lag across
  * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409cc00f20f24d8cc6456554d67aba0a..d6afa45f008a3ba4460c2dfbe2a9c9cc829ba5fa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -125,6 +126,37 @@ extern int sched_rr_timeslice;
  */
 #define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
 
+#ifdef CONFIG_SCHED_LATENCY_NICE
+/*
+ * Latency nice is meant to provide scheduler hints about the relative
+ * latency requirements of a task with respect to other tasks.
+ * A task with latency_nice == 19 is thus hinted as having no latency
+ * requirement, whereas a task with latency_nice == -20 should be given
+ * priority in terms of lower wakeup latency.
+ */
+#define MAX_LATENCY_NICE	19
+#define MIN_LATENCY_NICE	-20
+
+#define LATENCY_NICE_WIDTH	\
+	(MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1)
+
+/*
+ * Default tasks should be treated as a task with latency_nice = 0.
+ */ +#define DEFAULT_LATENCY_NICE 0 +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static latency [ 0..39 ], + * and back. + */ +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT) +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT) +#endif /* CONFIG_SCHED_LATENCY_NICE */ + /* * Increase resolution of nice-level calculations for 64-bit architectures. * The extra resolution improves shares distribution and load balancing of @@ -2181,6 +2213,9 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE); extern const int sched_prio_to_weight[40]; extern const u32 sched_prio_to_wmult[40]; +#ifdef CONFIG_SCHED_LATENCY_NICE +extern const int sched_latency_to_weight[40]; +#endif /* * {de,en}queue flags: