diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 8f1d6569564c4099deea31b1ebfd2371481ee36a..025a6c9dd622c44ff3be9e73399ae26de1347f31 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -180,9 +180,38 @@ static struct attribute_group crash_note_cpu_attr_group = { }; #endif +#ifdef CONFIG_CPU_ISOLATION_OPT +static ssize_t isolate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + ssize_t rc; + int cpuid = cpu->dev.id; + unsigned int isolated = cpu_isolated(cpuid); + + rc = sysfs_emit(buf, "%d\n", isolated); + + return rc; +} + +static DEVICE_ATTR_RO(isolate); + +static struct attribute *cpu_isolated_attrs[] = { + &dev_attr_isolate.attr, + NULL +}; + +static struct attribute_group cpu_isolated_attr_group = { + .attrs = cpu_isolated_attrs, +}; +#endif + static const struct attribute_group *common_cpu_attr_groups[] = { #ifdef CONFIG_KEXEC &crash_note_cpu_attr_group, +#endif +#ifdef CONFIG_CPU_ISOLATION_OPT + &cpu_isolated_attr_group, #endif NULL }; @@ -190,6 +219,9 @@ static const struct attribute_group *common_cpu_attr_groups[] = { static const struct attribute_group *hotplugable_cpu_attr_groups[] = { #ifdef CONFIG_KEXEC &crash_note_cpu_attr_group, +#endif +#ifdef CONFIG_CPU_ISOLATION_OPT + &cpu_isolated_attr_group, #endif NULL }; @@ -220,6 +252,9 @@ static struct cpu_attr cpu_attrs[] = { _CPU_ATTR(online, &__cpu_online_mask), _CPU_ATTR(possible, &__cpu_possible_mask), _CPU_ATTR(present, &__cpu_present_mask), +#ifdef CONFIG_CPU_ISOLATION_OPT + _CPU_ATTR(core_ctl_isolated, &__cpu_isolated_mask), +#endif }; /* @@ -465,6 +500,9 @@ static struct attribute *cpu_root_attrs[] = { &cpu_attrs[0].attr.attr, &cpu_attrs[1].attr.attr, &cpu_attrs[2].attr.attr, +#ifdef CONFIG_CPU_ISOLATION_OPT + &cpu_attrs[3].attr.attr, +#endif &dev_attr_kernel_max.attr, &dev_attr_offline.attr, &dev_attr_isolated.attr, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8fb893ed205e3c77e82d39ae70e4984e226c55fd..dcd53762c4676de3355958d80fd2ad58da5f906d 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -71,6 +71,9 @@ enum cpuhp_state { CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, +#ifdef CONFIG_SCHED_CORE_CTRL + CPUHP_CORE_CTL_ISOLATION_DEAD, +#endif CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_POWERPC_PMAC_PREPARE, CPUHP_POWERPC_MMU_CTX_PREPARE, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f0d895d6ac39f1b1d8a45c54ec00eb66ec113224..eb5acbe17a566dae777dd5397705ab5ab8572bc7 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -55,6 +55,7 @@ extern unsigned int nr_cpu_ids; * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration + * cpu_isolated_mask- has bit 'cpu' set iff cpu isolated * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. 
* @@ -96,6 +97,11 @@ extern struct cpumask __cpu_active_mask; #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) +#ifdef CONFIG_CPU_ISOLATION_OPT +extern struct cpumask __cpu_isolated_mask; +#define cpu_isolated_mask ((const struct cpumask *)&__cpu_isolated_mask) +#endif + extern atomic_t __num_online_cpus; #if NR_CPUS > 1 @@ -129,6 +135,22 @@ static inline unsigned int num_online_cpus(void) #define cpu_active(cpu) ((cpu) == 0) #endif +#if defined(CONFIG_CPU_ISOLATION_OPT) && NR_CPUS > 1 +#define num_isolated_cpus() cpumask_weight(cpu_isolated_mask) +#define num_online_uniso_cpus() \ +({ \ + cpumask_t mask; \ + \ + cpumask_andnot(&mask, cpu_online_mask, cpu_isolated_mask); \ + cpumask_weight(&mask); \ +}) +#define cpu_isolated(cpu) cpumask_test_cpu((cpu), cpu_isolated_mask) +#else /* !CONFIG_CPU_ISOLATION_OPT || NR_CPUS == 1 */ +#define num_isolated_cpus() 0U +#define num_online_uniso_cpus() num_online_cpus() +#define cpu_isolated(cpu) 0U +#endif + extern cpumask_t cpus_booted_once_mask; static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) @@ -811,6 +833,9 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) +#ifdef CONFIG_CPU_ISOLATION_OPT +#define for_each_isolated_cpu(cpu) for_each_cpu((cpu), cpu_isolated_mask) +#endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); @@ -851,6 +876,16 @@ set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, &__cpu_active_mask); } +#ifdef CONFIG_CPU_ISOLATION_OPT +static inline void +set_cpu_isolated(unsigned int cpu, bool isolated) +{ + if (isolated) + cpumask_set_cpu(cpu, &__cpu_isolated_mask); + else + cpumask_clear_cpu(cpu, &__cpu_isolated_mask); +} +#endif /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7f1b8549ebcee19bec2c919f71c17b1d610108cd..f1c6982ef650d688a1aa9d9f2752ad874c75e3a0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -74,6 +74,7 @@ enum hrtimer_restart { * * 0x00 inactive * 0x01 enqueued into rbtree + * 0x02 timer is pinned to a cpu * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free @@ -93,6 +94,8 @@ enum hrtimer_restart { */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 +#define HRTIMER_PINNED_SHIFT 1 +#define HRTIMER_STATE_PINNED (1 << HRTIMER_PINNED_SHIFT) /** * struct hrtimer - the basic hrtimer structure @@ -366,6 +369,12 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) #endif /* Exported timer functions: */ +#ifdef CONFIG_CPU_ISOLATION_OPT +/* To be used from cpusets, only */ +extern void hrtimer_quiesce_cpu(void *cpup); +#else +static inline void hrtimer_quiesce_cpu(void *cpup) { } +#endif /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 750c7f395ca907f772ad8e784e6df32e473ead72..6848b270f3660b1852ecc2baa091aee0e320b98b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -13,6 +13,11 @@ #ifdef CONFIG_LOCKUP_DETECTOR void 
lockup_detector_init(void); +#ifdef CONFIG_CPU_ISOLATION_OPT +extern void watchdog_enable(unsigned int cpu); +extern void watchdog_disable(unsigned int cpu); +extern bool watchdog_configured(unsigned int cpu); +#endif void lockup_detector_soft_poweroff(void); void lockup_detector_cleanup(void); bool is_hardlockup(void); @@ -37,6 +42,22 @@ extern int sysctl_hardlockup_all_cpu_backtrace; static inline void lockup_detector_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } static inline void lockup_detector_cleanup(void) { } +#ifdef CONFIG_CPU_ISOLATION_OPT +static inline void watchdog_enable(unsigned int cpu) +{ +} +static inline void watchdog_disable(unsigned int cpu) +{ +} +static inline bool watchdog_configured(unsigned int cpu) +{ + /* + * Pretend the watchdog is always configured. + * We will be waiting for the watchdog to be enabled in core isolation + */ + return true; +} +#endif #endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR diff --git a/include/linux/sched.h b/include/linux/sched.h index e4b281653f7c59d2612f14a6c8a607386833c8ba..44d5d8ed532adab5980ac9af1f0b34852de7c54d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -222,6 +222,41 @@ enum task_event { IRQ_UPDATE = 5, }; +#ifdef CONFIG_CPU_ISOLATION_OPT +extern int sched_isolate_count(const cpumask_t *mask, bool include_offline); +extern int sched_isolate_cpu(int cpu); +extern int sched_unisolate_cpu(int cpu); +extern int sched_unisolate_cpu_unlocked(int cpu); +#else +static inline int sched_isolate_count(const cpumask_t *mask, + bool include_offline) +{ + cpumask_t count_mask; + + if (include_offline) + cpumask_andnot(&count_mask, mask, cpu_online_mask); + else + return 0; + + return cpumask_weight(&count_mask); +} + +static inline int sched_isolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu_unlocked(int cpu) +{ + return 0; +} +#endif + extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/include/linux/sched/core_ctl.h b/include/linux/sched/core_ctl.h new file mode 100644 index 0000000000000000000000000000000000000000..ca321b7b0b08449b2952fca5554233f35ed087ca --- /dev/null +++ b/include/linux/sched/core_ctl.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2016, 2019-2020, The Linux Foundation. All rights reserved. 
+ */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#ifdef CONFIG_SCHED_CORE_CTRL +extern void core_ctl_check(u64 wallclock); +#else +static inline void core_ctl_check(u64 wallclock) { } +#endif +#endif diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index cc9f393e2a70603ed16974afeb60aa6569bcd26d..22420b45b39341959e5660f2b6a40ed64535966e 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -28,10 +28,25 @@ extern void __init housekeeping_init(void); #else +#ifdef CONFIG_CPU_ISOLATION_OPT +static inline int housekeeping_any_cpu(enum hk_flags flags) +{ + cpumask_t available; + int cpu; + + cpumask_andnot(&available, cpu_online_mask, cpu_isolated_mask); + cpu = cpumask_any(&available); + if (cpu >= nr_cpu_ids) + cpu = smp_processor_id(); + + return cpu; +} +#else static inline int housekeeping_any_cpu(enum hk_flags flags) { return smp_processor_id(); } +#endif static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { @@ -54,7 +69,11 @@ static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) if (static_branch_unlikely(&housekeeping_overridden)) return housekeeping_test_cpu(cpu, flags); #endif +#ifdef CONFIG_CPU_ISOLATION_OPT + return !cpu_isolated(cpu); +#else return true; +#endif } #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 568286411b43ab5b48378eea1bbb0441bb8a520f..ca8b0d1ccf942b59ff316bb3ffc4433ade1eb140 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -21,6 +21,15 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); +#ifdef CONFIG_SCHED_WALT +extern unsigned int sched_get_cpu_util(int cpu); +#else +static inline unsigned int sched_get_cpu_util(int cpu) +{ + return 0; +} +#endif + static inline int sched_info_on(void) { #ifdef CONFIG_SCHEDSTATS diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 63ea9aff368f0daee3f876f20e8a90f6c0fef9ba..57908e3449797b7ed07f6a35e1c0d6053f66a031 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -32,6 +32,9 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); +#ifdef CONFIG_CPU_ISOLATION_OPT +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +#endif void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); @@ -80,6 +83,14 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, return false; } +static inline int stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + if (cpumask_test_cpu(raw_smp_processor_id(), cpumask)) + return stop_one_cpu(raw_smp_processor_id(), fn, arg); + return -ENOENT; +} + #endif /* CONFIG_SMP */ /* diff --git a/include/linux/timer.h b/include/linux/timer.h index d10bc7e73b41eff9a20900ae63f0c1538b71078b..f80f416bed14bd07384eb98775825fd7e9f3238d 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -180,6 +180,13 @@ extern int timer_reduce(struct timer_list *timer, unsigned long expires); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) +/* To be used from cpusets, only */ +#ifdef CONFIG_CPU_ISOLATION_OPT +extern void timer_quiesce_cpu(void *cpup); +#else +static inline void 
timer_quiesce_cpu(void *cpup) { } +#endif + extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c96a4337afe6c562e0b59fee69072de6424f3d72..27b6ed3c9e58d4229c9b88461e1da34b68f4d77b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -6,6 +6,7 @@ #define _TRACE_SCHED_H #include +#include #include #include @@ -600,6 +601,165 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); +#ifdef CONFIG_SCHED_CORE_CTRL +TRACE_EVENT(core_ctl_eval_need, + + TP_PROTO(unsigned int cpu, unsigned int old_need, + unsigned int new_need, unsigned int updated), + TP_ARGS(cpu, old_need, new_need, updated), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, old_need) + __field(u32, new_need) + __field(u32, updated) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->old_need = old_need; + __entry->new_need = new_need; + __entry->updated = updated; + ), + TP_printk("cpu=%u, old_need=%u, new_need=%u, updated=%u", __entry->cpu, + __entry->old_need, __entry->new_need, __entry->updated) +); + +TRACE_EVENT(core_ctl_set_busy, + + TP_PROTO(unsigned int cpu, unsigned int busy, + unsigned int old_is_busy, unsigned int is_busy, int high_irqload), + TP_ARGS(cpu, busy, old_is_busy, is_busy, high_irqload), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, busy) + __field(u32, old_is_busy) + __field(u32, is_busy) + __field(bool, high_irqload) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->busy = busy; + __entry->old_is_busy = old_is_busy; + __entry->is_busy = is_busy; + __entry->high_irqload = high_irqload; + ), + TP_printk("cpu=%u, busy=%u, old_is_busy=%u, new_is_busy=%u high_irqload=%d", + __entry->cpu, __entry->busy, __entry->old_is_busy, + __entry->is_busy, __entry->high_irqload) +); + +TRACE_EVENT(core_ctl_set_boost, + + TP_PROTO(u32 refcount, s32 ret), + TP_ARGS(refcount, ret), + TP_STRUCT__entry( + __field(u32, refcount) + __field(s32, ret) + ), + TP_fast_assign( + __entry->refcount = refcount; + __entry->ret = ret; + ), + TP_printk("refcount=%u, ret=%d", __entry->refcount, __entry->ret) +); + +TRACE_EVENT(core_ctl_update_nr_need, + + TP_PROTO(int cpu, int nr_need, int prev_misfit_need, + int nrrun, int max_nr, int nr_prev_assist), + + TP_ARGS(cpu, nr_need, prev_misfit_need, nrrun, max_nr, nr_prev_assist), + + TP_STRUCT__entry( + __field(int, cpu) + __field(int, nr_need) + __field(int, prev_misfit_need) + __field(int, nrrun) + __field(int, max_nr) + __field(int, nr_prev_assist) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr_need = nr_need; + __entry->prev_misfit_need = prev_misfit_need; + __entry->nrrun = nrrun; + __entry->max_nr = max_nr; + __entry->nr_prev_assist = nr_prev_assist; + ), + + TP_printk("cpu=%d nr_need=%d prev_misfit_need=%d nrrun=%d max_nr=%d nr_prev_assist=%d", + __entry->cpu, __entry->nr_need, __entry->prev_misfit_need, + __entry->nrrun, __entry->max_nr, __entry->nr_prev_assist) +); +#endif + +#ifdef CONFIG_SCHED_RUNNING_AVG +/* + * Tracepoint for sched_get_nr_running_avg + */ +TRACE_EVENT(sched_get_nr_running_avg, + + TP_PROTO(int cpu, int nr, int nr_misfit, int nr_max), + + TP_ARGS(cpu, nr, nr_misfit, nr_max), + + TP_STRUCT__entry( + __field(int, cpu) + __field(int, nr) + __field(int, nr_misfit) + __field(int, nr_max) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr = nr; + __entry->nr_misfit = nr_misfit; + __entry->nr_max = nr_max; + ), + + TP_printk("cpu=%d nr=%d 
nr_misfit=%d nr_max=%d", + __entry->cpu, __entry->nr, __entry->nr_misfit, __entry->nr_max) +); +#endif + +#ifdef CONFIG_CPU_ISOLATION_OPT +/* + * sched_isolate - called when cores are isolated/unisolated + * + * @acutal_mask: mask of cores actually isolated/unisolated + * @req_mask: mask of cores requested isolated/unisolated + * @online_mask: cpu online mask + * @time: amount of time in us it took to isolate/unisolate + * @isolate: 1 if isolating, 0 if unisolating + * + */ +TRACE_EVENT(sched_isolate, + + TP_PROTO(unsigned int requested_cpu, unsigned int isolated_cpus, + u64 start_time, unsigned char isolate), + + TP_ARGS(requested_cpu, isolated_cpus, start_time, isolate), + + TP_STRUCT__entry( + __field(u32, requested_cpu) + __field(u32, isolated_cpus) + __field(u32, time) + __field(unsigned char, isolate) + ), + + TP_fast_assign( + __entry->requested_cpu = requested_cpu; + __entry->isolated_cpus = isolated_cpus; + __entry->time = div64_u64(sched_clock() - start_time, 1000); + __entry->isolate = isolate; + ), + + TP_printk("iso cpu=%u cpus=0x%x time=%u us isolated=%d", + __entry->requested_cpu, __entry->isolated_cpus, + __entry->time, __entry->isolate) +); +#endif + /* * Following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. diff --git a/init/Kconfig b/init/Kconfig index 8b20edacf9217c2ced62e63344c5c338e03e4832..1512479e7782a5d3ffec8fe9609a4a3175c7e579 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -658,6 +658,32 @@ config CPU_ISOLATION Say Y if unsure. +config SCHED_RUNNING_AVG + bool "per-rq and per-cluster running average statistics" + default n + +config CPU_ISOLATION_OPT + bool "CPU isolation optimization" + depends on SMP + default n + help + This option enables cpu isolation optimization, which allows + to isolate cpu dynamically. The isolated cpu will be unavailable + to scheduler and load balancer, and all its non-pinned timers, + IRQs and tasks will be migrated to other cpus, only pinned + kthread and IRQS are still allowed to run, this achieves + similar effect as hotplug but at lower latency cost. + +config SCHED_CORE_CTRL + bool "Core control" + depends on CPU_ISOLATION_OPT + select SCHED_RUNNING_AVG + default n + help + This option enables the core control functionality in + the scheduler. Core control automatically isolate and + unisolate cores based on cpu load and utilization. 
+ source "kernel/rcu/Kconfig" config BUILD_BIN2C diff --git a/kernel/cpu.c b/kernel/cpu.c index 67c22941b5f275e05fe4e2de549e427f1435b96f..d33629370cf1a4166e388306435ddc43727c7d71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1052,6 +1052,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (!cpu_present(cpu)) return -EINVAL; +#ifdef CONFIG_CPU_ISOLATION_OPT + if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1) + return -EBUSY; +#endif + cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; @@ -2495,6 +2500,11 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); +#ifdef CONFIG_CPU_ISOLATION_OPT +struct cpumask __cpu_isolated_mask __read_mostly; +EXPORT_SYMBOL(__cpu_isolated_mask); +#endif + atomic_t __num_online_cpus __read_mostly; EXPORT_SYMBOL(__num_online_cpus); @@ -2513,6 +2523,13 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(&__cpu_online_mask, src); } +#ifdef CONFIG_CPU_ISOLATION_OPT +void init_cpu_isolated(const struct cpumask *src) +{ + cpumask_copy(&__cpu_isolated_mask, src); +} +#endif + void set_cpu_online(unsigned int cpu, bool online) { /* diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 02236b13b359974e95a2e3ca4ec3f0cadfd0962a..47e017cf33ce6335fe5e8315efb5e89121a7ab13 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "internals.h" @@ -58,6 +59,9 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity; bool brokeaff = false; int err; +#ifdef CONFIG_CPU_ISOLATION_OPT + struct cpumask available_cpus; +#endif /* * IRQ chip might be already torn down, but the irq descriptor is @@ -110,7 +114,17 @@ static bool migrate_one_irq(struct irq_desc *desc) if (maskchip && chip->irq_mask) chip->irq_mask(d); +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; +#endif + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { +#ifdef CONFIG_CPU_ISOLATION_OPT + const struct cpumask *default_affinity; +#endif + /* * If the interrupt is managed, then shut it down and leave * the affinity untouched. @@ -120,7 +134,38 @@ static bool migrate_one_irq(struct irq_desc *desc) irq_shutdown_and_deactivate(desc); return false; } + +#ifdef CONFIG_CPU_ISOLATION_OPT + default_affinity = desc->affinity_hint ? : irq_default_affinity; + /* + * The order of preference for selecting a fallback CPU is + * + * (1) online and un-isolated CPU from default affinity + * (2) online and un-isolated CPU + * (3) online CPU + */ + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_intersects(&available_cpus, default_affinity)) + cpumask_and(&available_cpus, &available_cpus, + default_affinity); + else if (cpumask_empty(&available_cpus)) + affinity = cpu_online_mask; + + /* + * We are overriding the affinity with all online and + * un-isolated cpus. irq_set_affinity_locked() call + * below notify this mask to PM QOS affinity listener. + * That results in applying the CPU_DMA_LATENCY QOS + * to all the CPUs specified in the mask. But the low + * level irqchip driver sets the affinity of an irq + * to only one CPU. So pick only one CPU from the + * prepared mask while overriding the user affinity. 
+ */ + affinity = cpumask_of(cpumask_any(affinity)); +#else affinity = cpu_online_mask; +#endif brokeaff = true; } /* @@ -129,7 +174,11 @@ static bool migrate_one_irq(struct irq_desc *desc) * mask and therefore might keep/reassign the irq to the outgoing * CPU. */ +#ifdef CONFIG_CPU_ISOLATION_OPT + err = irq_set_affinity_locked(d, affinity, false); +#else err = irq_do_set_affinity(d, affinity, false); +#endif if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 72513ed2a5fc664be5d3a4a1ac83ec3d365c6370..5613e4a3f2e2afb25179ce79c3116a70f7e416d9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -154,6 +154,12 @@ static ssize_t write_irq_affinity(int type, struct file *file, if (err) goto free_cpumask; +#ifdef CONFIG_CPU_ISOLATION_OPT + if (cpumask_subset(new_value, cpu_isolated_mask)) { + err = -EINVAL; + goto free_cpumask; + } +#endif /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 75ab238bde9d7f1e0b165dd44449b9e9ac382fd6..0e3173ee99fb8bea53d15d6081457a02a647394a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,5 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o +obj-$(CONFIG_SCHED_RUNNING_AVG) += sched_avg.o +obj-$(CONFIG_SCHED_CORE_CTRL) += core_ctl.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aed3b931e6708263a8a2c49e140b479bb9e6fe0a..33e19cbd4eee78ad482faa46b9ef9791970e1d29 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include @@ -1893,6 +1895,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq_flags rf; struct rq *rq; int ret = 0; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_t allowed_mask; +#endif rq = task_rq_lock(p, &rf); update_rq_clock(rq); @@ -1916,6 +1921,20 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + cpumask_and(&allowed_mask, &allowed_mask, cpu_valid_mask); + + dest_cpu = cpumask_any(&allowed_mask); + if (dest_cpu >= nr_cpu_ids) { + cpumask_and(&allowed_mask, cpu_valid_mask, new_mask); + dest_cpu = cpumask_any(&allowed_mask); + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + ret = -EINVAL; + goto out; + } + } +#else /* * Picking a ~random cpu helps in cases where we are changing affinity * for groups of tasks (ie. cpuset), so that load balancing is not @@ -1926,6 +1945,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, ret = -EINVAL; goto out; } +#endif do_set_cpus_allowed(p, new_mask); @@ -1940,8 +1960,13 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, } /* Can the task run on the task's current CPU? If so, we're done */ +#ifdef CONFIG_CPU_ISOLATION_OPT + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) + goto out; +#else if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; +#endif if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; @@ -2293,12 +2318,19 @@ EXPORT_SYMBOL_GPL(kick_process); * select_task_rq() below may allow selection of !active CPUs in order * to satisfy the above rules. 
*/ +#ifdef CONFIG_CPU_ISOLATION_OPT +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) +#else static int select_fallback_rq(int cpu, struct task_struct *p) +#endif { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; - enum { cpuset, possible, fail } state = cpuset; + enum { cpuset, possible, fail, bug } state = cpuset; int dest_cpu; +#ifdef CONFIG_CPU_ISOLATION_OPT + int isolated_candidate = -1; +#endif /* * If the node that the CPU is on has been offlined, cpu_to_node() @@ -2312,6 +2344,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, nodemask) { if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) return dest_cpu; } @@ -2322,7 +2356,18 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, p->cpus_ptr) { if (!is_cpu_allowed(p, dest_cpu)) continue; +#ifdef CONFIG_CPU_ISOLATION_OPT + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; +#endif goto out; } @@ -2341,6 +2386,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) break; case fail: +#ifdef CONFIG_CPU_ISOLATION_OPT + allow_iso = true; + state = bug; + break; +#else + /* fall through; */ +#endif + + case bug: BUG(); break; } @@ -2368,6 +2422,10 @@ static int select_fallback_rq(int cpu, struct task_struct *p) static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { +#ifdef CONFIG_CPU_ISOLATION_OPT + bool allow_isolated = (p->flags & PF_KTHREAD); +#endif + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -2385,8 +2443,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ +#ifdef CONFIG_CPU_ISOLATION_OPT + if (unlikely(!is_cpu_allowed(p, cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); +#else if (unlikely(!is_cpu_allowed(p, cpu))) cpu = select_fallback_rq(task_cpu(p), p); +#endif return cpu; } @@ -3939,7 +4003,7 @@ void sched_exec(void) if (dest_cpu == smp_processor_id()) goto unlock; - if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -5936,6 +6000,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; +#ifdef CONFIG_CPU_ISOLATION_OPT + int dest_cpu; + cpumask_t allowed_mask; +#endif rcu_read_lock(); @@ -5997,20 +6065,30 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. 
Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); + if (dest_cpu < nr_cpu_ids) { +#endif + retval = __set_cpus_allowed_ptr(p, new_mask, true); + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } } +#ifdef CONFIG_CPU_ISOLATION_OPT + } else { + retval = -EINVAL; } +#endif + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -6074,6 +6152,16 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); + +#ifdef CONFIG_CPU_ISOLATION_OPT + /* The userspace tasks are forbidden to run on + * isolated CPUs. So exclude isolated CPUs from + * the getaffinity. + */ + if (!(p->flags & PF_KTHREAD)) + cpumask_andnot(mask, mask, cpu_isolated_mask); +#endif + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -6761,20 +6849,77 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) BUG(); } +#ifdef CONFIG_CPU_ISOLATION_OPT +/* + * Remove a task from the runqueue and pretend that it's migrating. This + * should prevent migrations for the detached task and disallow further + * changes to tsk_cpus_allowed. + */ +static void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks) +{ + lockdep_assert_held(&rq->lock); + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(rq, p, 0); + list_add(&p->se.group_node, tasks); +} + +static void attach_tasks_core(struct list_head *tasks, struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_held(&rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + BUG_ON(task_rq(p) != rq); + activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + } +} + +#else + +static void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks) +{ +} + +static void attach_tasks_core(struct list_head *tasks, struct rq *rq) +{ +} + +#endif /* CONFIG_CPU_ISOLATION_OPT */ + /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). * * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. 
*/ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, + bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; struct rq_flags orf = *rf; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + LIST_HEAD(tasks); + cpumask_t avail_cpus; + +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); +#else + cpumask_copy(&avail_cpus, cpu_online_mask); +#endif /* * Fudge the rq selection such that the below task selection loop @@ -6797,13 +6942,20 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) for (;;) { /* * There's this thread running, bail when that's the only - * remaining thread: + * remaining thread. */ if (rq->nr_running == 1) break; next = __pick_migrate_task(rq); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_mask)) { + detach_one_task_core(next, rq, &tasks); + num_pinned_kthreads += 1; + continue; + } + /* * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either @@ -6816,31 +6968,278 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) rq_unlock(rq, rf); raw_spin_lock(&next->pi_lock); rq_relock(rq, rf); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); /* * Since we're inside stop-machine, _nothing_ should have * changed the task, WARN if weird stuff happened, because in * that case the above rq->lock drop is a fail too. + * However, during cpu isolation the load balancer might have + * interfered since we don't stop all CPUs. Ignore warning for + * this case. */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + if (task_rq(next) != rq || !task_on_rq_queued(next)) { + WARN_ON(migrate_pinned_tasks); raw_spin_unlock(&next->pi_lock); continue; } /* Find suitable destination for @next, with force if needed. */ +#ifdef CONFIG_CPU_ISOLATION_OPT + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); +#else dest_cpu = select_fallback_rq(dead_rq->cpu, next); +#endif rq = __migrate_task(rq, rf, next, dest_cpu); if (rq != dead_rq) { rq_unlock(rq, rf); rq = dead_rq; *rf = orf; rq_relock(rq, rf); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); } raw_spin_unlock(&next->pi_lock); } rq->stop = stop; + + if (num_pinned_kthreads > 1) + attach_tasks_core(&tasks, rq); } + +#ifdef CONFIG_CPU_ISOLATION_OPT +int do_isolation_work_cpu_stop(void *data) +{ + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + watchdog_disable(cpu); + + local_irq_disable(); + + irq_migrate_all_off_this_cpu(); + + flush_smp_call_function_from_idle(); + + /* Update our root-domain */ + rq_lock(rq, &rf); + + /* + * Temporarily mark the rq as offline. This will allow us to + * move tasks off the CPU.
+ */ + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, &rf, false); + + if (rq->rd) + set_rq_online(rq); + rq_unlock(rq, &rf); + + local_irq_enable(); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. + */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); +} + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is responsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq; + cpumask_t avail_cpus; + int ret_code = 0; + u64 start_time = 0; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + cpu_maps_update_begin(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) || + !cpu_online(cpu) || cpu >= NR_CPUS) { + ret_code = -EINVAL; + goto out; + } + + rq = cpu_rq(cpu); + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + --cpu_isolation_vote[cpu]; + ret_code = -EINVAL; + goto out; + } + + /* + * There is a race between watchdog being enabled by hotplug and + * core isolation disabling the watchdog. When a CPU is hotplugged in + * and the hotplug lock has been released, the watchdog thread might + * not have run yet to enable the watchdog. + * We have to wait for the watchdog to be enabled before proceeding.
+ */ + if (!watchdog_configured(cpu)) { + msleep(20); + if (!watchdog_configured(cpu)) { + --cpu_isolation_vote[cpu]; + ret_code = -EBUSY; + goto out; + } + } + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + watchdog_disable(cpu); + irq_lock_sparse(); + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + irq_unlock_sparse(); + + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + cpu_maps_update_done(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is responsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + u64 start_time = 0; + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) + || cpu >= NR_CPUS) { + ret_code = -EINVAL; + goto out; + } + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + cpu_maps_update_begin(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + cpu_maps_update_done(); + return ret_code; +} + +#endif /* CONFIG_CPU_ISOLATION_OPT */ + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -7028,7 +7427,7 @@ int sched_cpu_dying(unsigned int cpu) BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq, &rf); + migrate_tasks(rq, &rf, true); BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 0000000000000000000000000000000000000000..eef1d69211782dd890a4aab03788421adf64e80f --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1061 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2014-2021, The Linux Foundation. All rights reserved.
+ */ + +#define pr_fmt(fmt) "core_ctl: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sched.h" +#include "walt.h" + +#define MAX_CPUS_PER_CLUSTER 6 +#define MAX_CLUSTERS 3 + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + unsigned int nr_isolated_cpus; + unsigned int nr_not_preferred_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + unsigned int max_nr; + unsigned int nr_prev_assist; + unsigned int nr_prev_assist_thresh; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool enable; + int nrrun; + struct task_struct *core_ctl_thread; + unsigned int first_cpu; + unsigned int boost; + struct kobject kobj; +}; + +struct cpu_data { + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool isolated_by_us; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\ + (idx)++) + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(struct cluster_data *state); +static bool initialized; + +ATOMIC_NOTIFIER_HEAD(core_ctl_notifier); +static unsigned int last_nr_big; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->max_cpus); +} + +static ssize_t store_enable(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + bool bval; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + bval = !!val; + if (bval != state->enable) { + state->enable = bval; + apply_need(state); + } + + return count; +} + +static ssize_t show_enable(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->enable); +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, 
char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + spin_lock_irq(&state_lock); + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += sysfs_emit_at(buf, count, + "CPU%u\n", cpu); + count += sysfs_emit_at(buf, count, + "\tCPU: %u\n", c->cpu); + count += sysfs_emit_at(buf, count, + "\tOnline: %u\n", + cpu_online(c->cpu)); + count += sysfs_emit_at(buf, count, + "\tIsolated: %u\n", + cpu_isolated(c->cpu)); + count += sysfs_emit_at(buf, count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += sysfs_emit_at(buf, count, + "\tBusy%%: %u\n", c->busy); + count += sysfs_emit_at(buf, count, + "\tIs busy: %u\n", c->is_busy); + count += sysfs_emit_at(buf, count, + "\tNot preferred: %u\n", + c->not_preferred); + count += sysfs_emit_at(buf, count, + "\tNr running: %u\n", cluster->nrrun); + count += sysfs_emit_at(buf, count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += sysfs_emit_at(buf, count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += sysfs_emit_at(buf, count, + "\tNr isolated CPUs: %u\n", + cluster->nr_isolated_cpus); + count += sysfs_emit_at(buf, count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + spin_unlock_irq(&state_lock); + + return count; +} + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *, char *); + ssize_t (*store)(struct cluster_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(enable); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &enable.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + NULL +}; + +#define to_cluster_data(k) container_of(k, struct cluster_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +static struct sched_avg_stats nr_stats[NR_CPUS]; + +/* + * nr_need: + * Number of tasks running on this cluster plus + * tasks running on higher capacity clusters. + * To find out CPUs needed from this cluster. 
+ * + * For example: + * On a dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 4 small tasks running on min capacity CPUs + * and 2 big tasks running on 2 max capacity + * CPUs, nr_need has to be 6 for min capacity + * cluster and 2 for max capacity cluster. + * This is because the min capacity cluster has to + * account for tasks running on max capacity + * cluster, so that the min capacity cluster + * can be ready to accommodate tasks running on max + * capacity CPUs if the demand of tasks goes down. + */ +static int compute_cluster_nr_need(int index) +{ + int cpu; + struct cluster_data *cluster; + int nr_need = 0; + + for_each_cluster(cluster, index) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_need += nr_stats[cpu].nr; + } + + return nr_need; +} + +/* + * prev_misfit_need: + * Tasks running on a smaller capacity cluster which + * need to be migrated to a higher capacity cluster. + * To find out how many tasks need higher capacity CPUs. + * + * For example: + * On a dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 2 small tasks and 2 big tasks running on + * min capacity CPUs and no tasks running on + * max capacity CPUs, prev_misfit_need of min capacity + * cluster will be 0 and prev_misfit_need of + * max capacity cluster will be 2. + */ +static int compute_prev_cluster_misfit_need(int index) +{ + int cpu; + struct cluster_data *prev_cluster; + int prev_misfit_need = 0; + + /* + * Lowest capacity cluster does not have to + * accommodate any misfit tasks. + */ + if (index == 0) + return 0; + + prev_cluster = &cluster_state[index - 1]; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + prev_misfit_need += nr_stats[cpu].nr_misfit; + + return prev_misfit_need; +} + +static int compute_cluster_max_nr(int index) +{ + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + int max_nr = 0; + + for_each_cpu(cpu, &cluster->cpu_mask) + max_nr = max(max_nr, nr_stats[cpu].nr_max); + + return max_nr; +} + +static int cluster_real_big_tasks(int index) +{ + int nr_big = 0; + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + + if (index == 0) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr_misfit; + } else { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr; + } + + return nr_big; +} + +/* + * prev_nr_need_assist: + * Tasks that are eligible to run on the previous + * cluster but cannot run because of insufficient + * CPUs there. prev_nr_need_assist is indicative + * of the number of CPUs in this cluster that should + * assist its previous cluster to make up for + * insufficient CPUs there. + * + * For example: + * On a tri-cluster system with 4 min capacity + * CPUs, 3 intermediate capacity CPUs and 1 + * max capacity CPU, if there are 4 small + * tasks running on min capacity CPUs, 4 big + * tasks running on intermediate capacity CPUs + * and no tasks running on max capacity CPU, + * prev_nr_need_assist for min & max capacity + * clusters will be 0, but for the intermediate + * capacity cluster prev_nr_need_assist will + * be 1 as it has 3 CPUs but there are 4 big + * tasks to be served. + */ +static int prev_cluster_nr_need_assist(int index) +{ + int need = 0; + int cpu; + struct cluster_data *prev_cluster; + + if (index == 0) + return 0; + + index--; + prev_cluster = &cluster_state[index]; + + /* + * Next cluster should not assist while there are isolated cpus + * in this cluster.
+ */ + if (prev_cluster->nr_isolated_cpus) + return 0; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + need += nr_stats[cpu].nr; + + need += compute_prev_cluster_misfit_need(index); + + if (need > prev_cluster->active_cpus) + need = need - prev_cluster->active_cpus; + else + need = 0; + + return need; +} + +static void update_running_avg(void) +{ + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + int big_avg = 0; + + sched_get_nr_running_avg(nr_stats); + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + int nr_need, prev_misfit_need; + + if (!cluster->inited) + continue; + + nr_need = compute_cluster_nr_need(index); + prev_misfit_need = compute_prev_cluster_misfit_need(index); + + + cluster->nrrun = nr_need + prev_misfit_need; + cluster->max_nr = compute_cluster_max_nr(index); + cluster->nr_prev_assist = prev_cluster_nr_need_assist(index); + trace_core_ctl_update_nr_need(cluster->first_cpu, nr_need, + prev_misfit_need, + cluster->nrrun, cluster->max_nr, + cluster->nr_prev_assist); + big_avg += cluster_real_big_tasks(index); + } + spin_unlock_irqrestore(&state_lock, flags); + + last_nr_big = big_avg; +} + +#define MAX_NR_THRESHOLD 4 +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* unisolate all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* + * unisolate as many cores as the previous cluster + * needs assistance with. + */ + if (cluster->nr_prev_assist >= cluster->nr_prev_assist_thresh) + new_need = new_need + cluster->nr_prev_assist; + + /* only unisolate more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + new_need = new_need + 1; + + /* + * We don't want tasks to be overcrowded in a cluster. + * If any CPU has more than MAX_NR_THRESHOLD in the last + * window, bring another CPU to help out. + */ + if (cluster->max_nr > MAX_NR_THRESHOLD) + new_need = new_need + 1; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_isolate_count(&cluster->cpu_mask, true); +} + +static bool is_active(const struct cpu_data *state) +{ + return cpu_online(state->cpu) && !cpu_isolated(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + cluster->nr_isolated_cpus)); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int new_need; + s64 now, elapsed; + + if (unlikely(!cluster->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost || !cluster->enable) { + need_cpus = cluster->max_cpus; + } else { + cluster->active_cpus = get_active_cpu_count(cluster); + thres_idx = cluster->active_cpus ? 
cluster->active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + bool old_is_busy = c->is_busy; + int high_irqload = sched_cpu_high_irqload(c->cpu); + + if (c->busy >= cluster->busy_up_thres[thres_idx] || + high_irqload) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + trace_core_ctl_set_busy(c->cpu, c->busy, old_is_busy, + c->is_busy, high_irqload); + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + now = ktime_to_ms(ktime_get()); + + if (new_need > cluster->active_cpus) { + ret = 1; + } else { + /* + * When there is no change in need and there are no more + * active CPUs than currently needed, just update the + * need time stamp and return. + */ + if (new_need == last_need && new_need == cluster->active_cpus) { + cluster->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + elapsed = now - cluster->need_ts; + ret = elapsed >= cluster->offline_delay_ms; + } + + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } + trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(struct cluster_data *cluster) +{ + unsigned long flags; + + spin_lock_irqsave(&cluster->pending_lock, flags); + cluster->pending = true; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + wake_up_process(cluster->core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; + +int core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster = NULL; + unsigned long flags; + int ret = 0; + bool boost_state_changed = false; + + if (unlikely(!initialized)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + if (boost) { + boost_state_changed = !cluster->boost; + ++cluster->boost; + } else { + if (!cluster->boost) { + ret = -EINVAL; + break; + } else { + --cluster->boost; + boost_state_changed = !cluster->boost; + } + } + } + spin_unlock_irqrestore(&state_lock, flags); + + if (boost_state_changed) { + index = 0; + for_each_cluster(cluster, index) + apply_need(cluster); + } + + if (cluster) + trace_core_ctl_set_boost(cluster->boost, ret); + + return ret; +} +EXPORT_SYMBOL(core_ctl_set_boost); + +void core_ctl_check(u64 window_start) +{ + int cpu; + struct cpu_data *c; + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + + if (unlikely(!initialized)) + return; + + if (window_start == core_ctl_check_timestamp) + return; + + core_ctl_check_timestamp = window_start; + + spin_lock_irqsave(&state_lock, flags); + for_each_possible_cpu(cpu) { + + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + + if (!cluster || !cluster->inited) + continue; + + c->busy = sched_get_cpu_util(cpu); + } + spin_unlock_irqrestore(&state_lock, flags); + + update_running_avg(); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); + } +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + 
list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_isolate(struct cluster_data *cluster, unsigned int need) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_isolated = 0; + bool first_pass = cluster->nr_not_preferred_cpus; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't isolate busy CPUs. */ + if (c->is_busy) + continue; + + /* + * We isolate only the not_preferred CPUs. If none + * of the CPUs are selected as not_preferred, then + * all CPUs are eligible for isolation. + */ + if (cluster->nr_not_preferred_cpus && !c->not_preferred) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + +again: + /* + * If the number of active CPUs is within the limits, then + * don't force isolation of any busy CPUs. + */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + nr_isolated = 0; + num_cpus = cluster->num_cpus; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + if (first_pass && !c->not_preferred) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + + if (first_pass && cluster->active_cpus > cluster->max_cpus) { + first_pass = false; + goto again; + } +} + +static void __try_to_unisolate(struct cluster_data *cluster, + unsigned int need, bool force) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_unisolated = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). 
+ */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!c->isolated_by_us) + continue; + if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to unisolate CPU%u\n", c->cpu); + if (!sched_unisolate_cpu(c->cpu)) { + c->isolated_by_us = false; + move_cpu_lru(c); + nr_unisolated++; + } else { + pr_debug("Unable to unisolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus -= nr_unisolated; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_unisolate(struct cluster_data *cluster, unsigned int need) +{ + bool force_use_non_preferred = false; + + __try_to_unisolate(cluster, need, force_use_non_preferred); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_unisolate(cluster, need, force_use_non_preferred); +} + +static void __ref do_core_ctl(struct cluster_data *cluster) +{ + unsigned int need; + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_isolate(cluster, need); + else if (cluster->active_cpus < need) + try_to_unisolate(cluster, need); + } +} + +static int __ref try_core_ctl(void *data) +{ + struct cluster_data *cluster = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&cluster->pending_lock, flags); + if (!cluster->pending) { + spin_unlock_irqrestore(&cluster->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&cluster->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + cluster->pending = false; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + do_core_ctl(cluster); + } + + return 0; +} + +static int isolation_cpuhp_state(unsigned int cpu, bool online) +{ + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = state->cluster; + unsigned int need; + bool do_wakeup = false, unisolated = false; + unsigned long flags; + + if (unlikely(!cluster || !cluster->inited)) + return 0; + + if (online) { + cluster->active_cpus = get_active_cpu_count(cluster); + + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + move_cpu_lru(state); + } else { + /* + * We don't want to have a CPU both offline and isolated. + * So unisolate a CPU that went down if it was isolated by us. + */ + if (state->isolated_by_us) { + sched_unisolate_cpu_unlocked(cpu); + state->isolated_by_us = false; + unisolated = true; + } + + /* Move a CPU to the end of the LRU when it goes offline. 
*/ + move_cpu_lru(state); + + state->busy = 0; + cluster->active_cpus = get_active_cpu_count(cluster); + } + + need = apply_limits(cluster, cluster->need_cpus); + spin_lock_irqsave(&state_lock, flags); + if (unisolated) + cluster->nr_isolated_cpus--; + do_wakeup = adjustment_possible(cluster, need); + spin_unlock_irqrestore(&state_lock, flags); + if (do_wakeup) + wake_up_core_ctl_thread(cluster); + + return 0; +} + +static int core_ctl_isolation_online_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, true); +} + +static int core_ctl_isolation_dead_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, false); +} + +/* ============================ init code ============================== */ + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nr_prev_assist_thresh = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + cluster->enable = true; + cluster->nr_not_preferred_cpus = 0; + INIT_LIST_HEAD(&cluster->lru); + spin_lock_init(&cluster->pending_lock); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster, + "core_ctl/%d", first_cpu); + if (IS_ERR(cluster->core_ctl_thread)) + return PTR_ERR(cluster->core_ctl_thread); + + sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO, + ¶m); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +static int __init core_ctl_init(void) +{ + struct sched_cluster *cluster; + int ret; + + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "core_ctl/isolation:online", + core_ctl_isolation_online_cpu, NULL); + + cpuhp_setup_state_nocalls(CPUHP_CORE_CTL_ISOLATION_DEAD, + "core_ctl/isolation:dead", + NULL, core_ctl_isolation_dead_cpu); + + for_each_sched_cluster(cluster) { + ret = cluster_init(&cluster->cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + } + + initialized = true; + return 0; +} + +late_initcall(core_ctl_init); diff --git a/kernel/sched/core_ctl.h b/kernel/sched/core_ctl.h new file mode 
100644 index 0000000000000000000000000000000000000000..0be55ac6a526ba8c9ad246e1b674e4f26c17728a --- /dev/null +++ b/kernel/sched/core_ctl.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2016, 2019-2020, The Linux Foundation. All rights reserved. + */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#ifdef CONFIG_SCHED_CORE_CTRL +void core_ctl_check(u64 wallclock); +int core_ctl_set_boost(bool boost); +#else +static inline void core_ctl_check(u64 wallclock) {} +static inline int core_ctl_set_boost(bool boost) +{ + return 0; +} +#endif +#endif diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 0033731a07976292b2a8daf2310451abf3e31fa6..9d286972ed7a0b2c8c703c7ca81382d9148990da 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -79,6 +79,9 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, if (lowest_mask) { cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(lowest_mask, lowest_mask, cpu_isolated_mask); +#endif /* * We have to ensure that we have at least one bit * still set in the array, since the map could have diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f30bd5d6d655cc09dd1023897ea5d9b5b2988022..42d51caa611cdb1121ee81471c510972f688c36d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5978,6 +5978,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (cpu_isolated(i)) + continue; + if (sched_idle_cpu(i)) return i; @@ -6138,6 +6141,9 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int return -1; cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(cpus, cpus, cpu_isolated_mask); +#endif for_each_cpu_wrap(core, cpus, target) { bool idle = true; @@ -6176,6 +6182,8 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t if (!cpumask_test_cpu(cpu, p->cpus_ptr) || !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; + if (cpu_isolated(cpu)) + continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; } @@ -6240,6 +6248,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return -1; + if (cpu_isolated(cpu)) + continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; } @@ -6270,6 +6280,9 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); + if (cpu_isolated(cpu)) + continue; + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; if (fits_capacity(task_util, cpu_cap)) @@ -6311,15 +6324,15 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_capacity(task_util, target)) + !cpu_isolated(target) && asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_capacity(task_util, prev)) + ((available_idle_cpu(prev) || sched_idle_cpu(prev)) && + !cpu_isolated(target) && asym_fits_capacity(task_util, prev))) return prev; /* @@ -8301,6 +8314,9 @@ void update_group_capacity(struct 
sched_domain *sd, int cpu) for_each_cpu(cpu, sched_group_span(sdg)) { unsigned long cpu_cap = capacity_of(cpu); + if (cpu_isolated(cpu)) + continue; + capacity += cpu_cap; min_capacity = min(cpu_cap, min_capacity); max_capacity = max(cpu_cap, max_capacity); @@ -8314,10 +8330,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { struct sched_group_capacity *sgc = group->sgc; - - capacity += sgc->capacity; - min_capacity = min(sgc->min_capacity, min_capacity); - max_capacity = max(sgc->max_capacity, max_capacity); + __maybe_unused cpumask_t *cpus = + sched_group_span(group); + + if (!cpu_isolated(cpumask_first(cpus))) { + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, + min_capacity); + max_capacity = max(sgc->max_capacity, + max_capacity); + } group = group->next; } while (group != child->groups); } @@ -8525,6 +8547,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); + if (cpu_isolated(i)) + continue; + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; @@ -8566,6 +8591,15 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + sgs->group_type = group_has_spare; + sgs->group_weight = group->group_weight; + return; + } + /* Check if dst CPU is idle and preferred to this group */ if (env->sd->flags & SD_ASYM_PACKING && env->idle != CPU_NOT_IDLE && @@ -8911,6 +8945,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) .avg_load = UINT_MAX, .group_type = group_overloaded, }; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_t allowed_cpus; + + cpumask_andnot(&allowed_cpus, p->cpus_ptr, cpu_isolated_mask); +#endif imbalance = scale_load_down(NICE_0_LOAD) * (sd->imbalance_pct-100) / 100; @@ -8919,8 +8958,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; /* Skip over this group if it has no CPUs allowed */ +#ifdef CONFIG_CPU_ISOLATION_OPT + if (!cpumask_intersects(sched_group_span(group), + &allowed_cpus)) +#else if (!cpumask_intersects(sched_group_span(group), p->cpus_ptr)) +#endif continue; local_group = cpumask_test_cpu(this_cpu, @@ -9485,6 +9529,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + if (cpu_isolated(i)) + continue; + capacity = capacity_of(i); nr_running = rq->cfs.h_nr_running; @@ -9628,6 +9675,17 @@ static int need_active_balance(struct lb_env *env) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } +#ifdef CONFIG_CPU_ISOLATION_OPT +int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + cpumask_t cpus; + + cpumask_and(&cpus, sched_group_span(sg), group_balance_mask(sg)); + cpumask_andnot(&cpus, &cpus, cpu_isolated_mask); + return cpumask_first(&cpus); +} +#endif + static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) @@ -9651,7 +9709,7 @@ static int should_we_balance(struct lb_env *env) /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { - if (!idle_cpu(cpu)) + if (!idle_cpu(cpu) || cpu_isolated(cpu)) continue; /* Are we the first idle CPU? */ @@ -9659,7 +9717,7 @@ static int should_we_balance(struct lb_env *env) } /* Are we the first CPU of this group ? 
*/ - return group_balance_cpu(sg) == env->dst_cpu; + return group_balance_cpu_not_isolated(sg) == env->dst_cpu; } /* @@ -9861,7 +9919,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, * ->active_balance_work. Once set, it's cleared * only after active load balance is finished. */ - if (!busiest->active_balance) { + if (!busiest->active_balance && + !cpu_isolated(cpu_of(busiest))) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; @@ -10075,7 +10134,17 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + unsigned int available_cpus; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_t avail_mask; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); +#else + available_cpus = num_online_cpus(); +#endif + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -10205,6 +10274,9 @@ static inline int find_new_ilb(void) for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { + if (cpu_isolated(ilb)) + continue; + if (idle_cpu(ilb)) return ilb; } @@ -10259,6 +10331,7 @@ static void nohz_balancer_kick(struct rq *rq) struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; unsigned int flags = 0; + cpumask_t cpumask; if (unlikely(rq->idle_balance)) return; @@ -10273,8 +10346,15 @@ static void nohz_balancer_kick(struct rq *rq) * None are in tickless mode and hence no need for NOHZ idle load * balancing. */ +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&cpumask, nohz.idle_cpus_mask, cpu_isolated_mask); + if (cpumask_empty(&cpumask)) + return; +#else + cpumask_copy(&cpumask, nohz.idle_cpus_mask); if (likely(!atomic_read(&nohz.nr_cpus))) return; +#endif if (READ_ONCE(nohz.has_blocked) && time_after(now, READ_ONCE(nohz.next_blocked))) @@ -10310,7 +10390,7 @@ static void nohz_balancer_kick(struct rq *rq) * currently idle; in which case, kick the ILB to move tasks * around. */ - for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { + for_each_cpu_and(i, sched_domain_span(sd), &cpumask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; goto unlock; @@ -10488,6 +10568,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, int balance_cpu; int ret = false; struct rq *rq; + cpumask_t cpus; SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); @@ -10507,7 +10588,13 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, */ smp_mb(); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask); +#else + cpumask_copy(&cpus, nohz.idle_cpus_mask); +#endif + + for_each_cpu(balance_cpu, &cpus) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -10658,6 +10745,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we @@ -10771,6 +10861,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; + /* + * Since core isolation doesn't update nohz.idle_cpus_mask, there + * is a possibility this nohz kicked cpu could be isolated. Hence + * return if the cpu is isolated. 
+ */ + if (cpu_isolated(this_rq->cpu)) + return; + /* * If this CPU has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle CPUs whose ticks are @@ -10792,8 +10890,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. + */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5938cf2e421b53c05de4c484432c3d9b7388c2bd..6c1475950441419660a5d322250bedf868b2036f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -265,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -2279,7 +2283,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; rt_queue_pull_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6fd06c16ee24ac3cfab662565e38fae346648c4e..22ff400d5b08c7088783e2e4eea3598dc6d468c4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -140,6 +140,10 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); +#ifdef CONFIG_SMP +extern void init_sched_groups_capacity(int cpu, struct sched_domain *sd); +#endif + extern void call_trace_sched_update_nr_running(struct rq *rq, int count); /* * Helpers for converting nanosecond timing to jiffy resolution @@ -2957,6 +2961,11 @@ static inline unsigned long cpu_util_freq_walt(int cpu) return (util >= capacity) ? 
capacity : util; } + +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} #else /* CONFIG_SCHED_WALT */ static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta) { } @@ -2972,4 +2981,35 @@ static inline int is_reserved(int cpu) } static inline void clear_reserved(int cpu) { } + +static inline bool hmp_capable(void) +{ + return false; +} #endif /* CONFIG_SCHED_WALT */ + +struct sched_avg_stats { + int nr; + int nr_misfit; + int nr_max; + int nr_scaled; +}; +#ifdef CONFIG_SCHED_RUNNING_AVG +extern void sched_get_nr_running_avg(struct sched_avg_stats *stats); +#else +static inline void sched_get_nr_running_avg(struct sched_avg_stats *stats) { } +#endif + +#ifdef CONFIG_CPU_ISOLATION_OPT +extern int group_balance_cpu_not_isolated(struct sched_group *sg); +#else +static inline int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + return group_balance_cpu(sg); +} +#endif /* CONFIG_CPU_ISOLATION_OPT */ + +#ifdef CONFIG_HOTPLUG_CPU +extern void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, + bool migrate_pinned_tasks); +#endif diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c new file mode 100644 index 0000000000000000000000000000000000000000..d74579a1553db6f6545b0e02bcc7b21d9e5da312 --- /dev/null +++ b/kernel/sched/sched_avg.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2012, 2015-2021, The Linux Foundation. All rights reserved. + */ +/* + * Scheduler hook for average runqueue determination + */ +#include +#include +#include +#include +#include + +#include "sched.h" +#include "walt.h" +#include + +static DEFINE_PER_CPU(u64, nr_prod_sum); +static DEFINE_PER_CPU(u64, last_time); +static DEFINE_PER_CPU(u64, nr_big_prod_sum); +static DEFINE_PER_CPU(u64, nr); +static DEFINE_PER_CPU(u64, nr_max); + +static DEFINE_PER_CPU(unsigned long, iowait_prod_sum); +static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); +static s64 last_get_time; + +static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0); + +#define NR_THRESHOLD_PCT 15 + +/** + * sched_get_nr_running_avg + * @return: Average nr_running, iowait and nr_big_tasks value since last poll. + * Returns the avg * 100 to return up to two decimal points + * of accuracy. + * + * Obtains the average nr_running value since the last poll. + * This function may not be called concurrently with itself + */ +void sched_get_nr_running_avg(struct sched_avg_stats *stats) +{ + int cpu; + u64 curr_time = sched_clock(); + u64 period = curr_time - last_get_time; + u64 tmp_nr, tmp_misfit; + + if (!period) + return; + + /* read and reset nr_running counts */ + for_each_possible_cpu(cpu) { + unsigned long flags; + u64 diff; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + + tmp_nr = per_cpu(nr_prod_sum, cpu); + tmp_nr += per_cpu(nr, cpu) * diff; + tmp_nr = div64_u64((tmp_nr * 100), period); + + tmp_misfit = per_cpu(nr_big_prod_sum, cpu); + tmp_misfit = div64_u64((tmp_misfit * 100), period); + + /* + * NR_THRESHOLD_PCT is to make sure that the task ran + * at least 85% in the last window to compensate any + * over estimating being done. 
+ */ + stats[cpu].nr = (int)div64_u64((tmp_nr + NR_THRESHOLD_PCT), + 100); + stats[cpu].nr_misfit = (int)div64_u64((tmp_misfit + + NR_THRESHOLD_PCT), 100); + stats[cpu].nr_max = per_cpu(nr_max, cpu); + + trace_sched_get_nr_running_avg(cpu, stats[cpu].nr, + stats[cpu].nr_misfit, stats[cpu].nr_max); + + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr_prod_sum, cpu) = 0; + per_cpu(nr_big_prod_sum, cpu) = 0; + per_cpu(iowait_prod_sum, cpu) = 0; + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); + } + + last_get_time = curr_time; + +} +EXPORT_SYMBOL(sched_get_nr_running_avg); + +#define BUSY_NR_RUN 3 +#define BUSY_LOAD_FACTOR 10 +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ + bool nr_run_trigger = false, load_trigger = false; + + if (!hmp_capable() || is_min_capacity_cpu(cpu)) + return; + + if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN) + nr_run_trigger = true; + + if (dequeue && (cpu_util(cpu) * BUSY_LOAD_FACTOR) > + capacity_orig_of(cpu)) + load_trigger = true; + + if (nr_run_trigger || load_trigger) + atomic64_set(&per_cpu(last_busy_time, cpu), curr_time); +} + +/** + * sched_update_nr_prod + * @cpu: The core id of the nr running driver. + * @delta: Adjust nr by 'delta' amount + * @inc: Whether we are increasing or decreasing the count + * @return: N/A + * + * Update average with latest nr_running value for CPU + */ +void sched_update_nr_prod(int cpu, long delta, bool inc) +{ + u64 diff; + u64 curr_time; + unsigned long flags, nr_running; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + nr_running = per_cpu(nr, cpu); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta); + + BUG_ON((s64)per_cpu(nr, cpu) < 0); + + if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + update_last_busy_time(cpu, !inc, nr_running, curr_time); + + per_cpu(nr_prod_sum, cpu) += nr_running * diff; + per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); +} +EXPORT_SYMBOL(sched_update_nr_prod); + +/* + * Returns the CPU utilization % in the last window. + * + */ +unsigned int sched_get_cpu_util(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 util; + unsigned long capacity, flags; + unsigned int busy; + + raw_spin_lock_irqsave(&rq->lock, flags); + + util = rq->cfs.avg.util_avg; + capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = rq->prev_runnable_sum; + util = div64_u64(util, + sched_ravg_window >> SCHED_CAPACITY_SHIFT); + } +#endif + raw_spin_unlock_irqrestore(&rq->lock, flags); + + util = (util >= capacity) ? capacity : util; + busy = div64_ul((util * 100), capacity); + return busy; +} + +u64 sched_get_cpu_last_busy_time(int cpu) +{ + return atomic64_read(&per_cpu(last_busy_time, cpu)); +} diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 004e9505f7ad6e2f7eb8d7e688831c57772784a6..b30b62f0d683b45b59b213fa481ede662e25ac55 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1220,16 +1220,25 @@ build_sched_groups(struct sched_domain *sd, int cpu) * group having more cpu_capacity will pickup more load compared to the * group having less cpu_capacity. 
*/ -static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) +void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_t avail_mask; +#endif WARN_ON(!sg); do { int cpu, max_cpu = -1; +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&avail_mask, sched_group_span(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); +#else sg->group_weight = cpumask_weight(sched_group_span(sg)); +#endif if (!(sd->flags & SD_ASYM_PACKING)) goto next; diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 753b852ab340279d6f500752be1b6eb3a3fd36a2..30db3d6179140100099a9e275b7f900bc2a999e3 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -23,6 +23,7 @@ #include #include "sched.h" #include "walt.h" +#include "core_ctl.h" #define CREATE_TRACE_POINTS #include #undef CREATE_TRACE_POINTS @@ -1681,6 +1682,9 @@ void walt_irq_work(struct irq_work *irq_work) for_each_cpu(cpu, cpu_possible_mask) raw_spin_unlock(&cpu_rq(cpu)->lock); + + if (!is_migration) + core_ctl_check(this_rq()->window_start); } static void walt_init_once(void) diff --git a/kernel/smp.c b/kernel/smp.c index f73a597c8e4cf44bab200d61323e8eb2ca6e5e4f..92742aa1e3480e85830acb62421ba3d73ce0c03a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -957,7 +957,8 @@ void wake_up_all_idle_cpus(void) if (cpu == smp_processor_id()) continue; - wake_up_if_idle(cpu); + if (!cpu_isolated(cpu)) + wake_up_if_idle(cpu); } preempt_enable(); } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 890b79cf0e7c311c94e1ce5aa882a5f7f7d4e335..3e67402079bbeecd161dd4e237e79dcbba524505 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -442,7 +442,11 @@ static int __stop_cpus(const struct cpumask *cpumask, * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. */ +#ifdef CONFIG_CPU_ISOLATION_OPT +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +#else static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +#endif { int ret; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4ef90718c1146dfe0f875650bd3750dc9523b8aa..b8835ac5e31fe9f3d4a29de8c38ba63e6235837f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -984,7 +984,11 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ +#ifdef CONFIG_CPU_ISOLATION_OPT + WRITE_ONCE(timer->state, (timer->state | HRTIMER_STATE_ENQUEUED)); +#else WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); +#endif return timerqueue_add(&base->active, &timer->node); } @@ -1007,7 +1011,15 @@ static void __remove_hrtimer(struct hrtimer *timer, u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ +#ifdef CONFIG_CPU_ISOLATION_OPT + /* + * We need to preserve PINNED state here, otherwise we may end up + * migrating pinned hrtimers as well. 
+ */ + WRITE_ONCE(timer->state, newstate | (timer->state & HRTIMER_STATE_PINNED)); +#else WRITE_ONCE(timer->state, newstate); +#endif if (!(state & HRTIMER_STATE_ENQUEUED)) return; @@ -1061,6 +1073,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); +#ifdef CONFIG_CPU_ISOLATION_OPT + /* Make sure PINNED flag is cleared after removing hrtimer */ + timer->state &= ~HRTIMER_STATE_PINNED; +#endif return 1; } return 0; @@ -1153,6 +1169,12 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, new_base = base; } +#ifdef CONFIG_CPU_ISOLATION_OPT + timer->state &= ~HRTIMER_STATE_PINNED; + if (mode & HRTIMER_MODE_PINNED) + timer->state |= HRTIMER_STATE_PINNED; +#endif + first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) return first; @@ -1507,9 +1529,13 @@ bool hrtimer_active(const struct hrtimer *timer) do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); - +#ifdef CONFIG_CPU_ISOLATION_OPT + if (((timer->state & ~HRTIMER_STATE_PINNED) != + HRTIMER_STATE_INACTIVE) || base->running == timer) +#else if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) +#endif return true; } while (read_seqcount_retry(&base->seq, seq) || @@ -2082,6 +2108,117 @@ int hrtimers_prepare_cpu(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU +#ifdef CONFIG_CPU_ISOLATION_OPT +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base, + bool remove_pinned) +{ + struct hrtimer *timer; + struct timerqueue_node *node; + struct timerqueue_head pinned; + int is_pinned; + bool is_hotplug = !cpu_online(old_base->cpu_base->cpu); + + timerqueue_init_head(&pinned); + + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); + if (is_hotplug) + BUG_ON(hrtimer_callback_running(timer)); + debug_deactivate(timer); + + /* + * Mark it as ENQUEUED not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + + is_pinned = timer->state & HRTIMER_STATE_PINNED; + if (!remove_pinned && is_pinned) { + timerqueue_add(&pinned, &timer->node); + continue; + } + + timer->base = new_base; + /* + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. + */ + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); + } + + /* Re-queue pinned timers for non-hotplug usecase */ + while ((node = timerqueue_getnext(&pinned))) { + timer = container_of(node, struct hrtimer, node); + + timerqueue_del(&pinned, &timer->node); + enqueue_hrtimer(timer, old_base, HRTIMER_MODE_ABS); + } +} + +static void __migrate_hrtimers(unsigned int scpu, bool remove_pinned) +{ + struct hrtimer_cpu_base *old_base, *new_base; + unsigned long flags; + int i; + + local_irq_save(flags); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = this_cpu_ptr(&hrtimer_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. 
+ */ + raw_spin_lock(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i], remove_pinned); + } + + /* + * The migration might have changed the first expiring softirq + * timer on this CPU. Update it. + */ + hrtimer_update_softirq_timer(new_base, false); + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock(&new_base->lock); + + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_restore(flags); +} + +int hrtimers_dead_cpu(unsigned int scpu) +{ + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + /* + * this BH disable ensures that raise_softirq_irqoff() does + * not wakeup ksoftirqd (and acquire the pi-lock) while + * holding the cpu_base lock + */ + local_bh_disable(); + __migrate_hrtimers(scpu, true); + local_bh_enable(); + return 0; +} + +void hrtimer_quiesce_cpu(void *cpup) +{ + __migrate_hrtimers(*(int *)cpup, false); +} + +#else + static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { @@ -2157,6 +2294,8 @@ int hrtimers_dead_cpu(unsigned int scpu) return 0; } +#endif /* CONFIG_CPU_ISOLATION_OPT */ + #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a3ec21be3b140f3cc4c54be07495d1b8f799b4fe..926d0900fa36510e1481d9fa88c89045d06ffde5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1921,6 +1921,65 @@ signed long __sched schedule_timeout_idle(signed long timeout) EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU + +#ifdef CONFIG_CPU_ISOLATION_OPT +static void migrate_timer_list(struct timer_base *new_base, + struct hlist_head *head, bool remove_pinned) +{ + struct timer_list *timer; + int cpu = new_base->cpu; + struct hlist_node *n; + int is_pinned; + + hlist_for_each_entry_safe(timer, n, head, entry) { + is_pinned = timer->flags & TIMER_PINNED; + if (!remove_pinned && is_pinned) + continue; + + detach_if_pending(timer, get_timer_base(timer->flags), false); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + internal_add_timer(new_base, timer); + } +} + +static void __migrate_timers(unsigned int cpu, bool remove_pinned) +{ + struct timer_base *old_base; + struct timer_base *new_base; + unsigned long flags; + int b, i; + + for (b = 0; b < NR_BASES; b++) { + old_base = per_cpu_ptr(&timer_bases[b], cpu); + new_base = get_cpu_ptr(&timer_bases[b]); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + raw_spin_lock_irqsave(&new_base->lock, flags); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + /* + * The current CPUs base clock might be stale. Update it + * before moving the timers over. 
+ */ + forward_timer_base(new_base); + + if (!cpu_online(cpu)) + BUG_ON(old_base->running_timer); + + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i, + remove_pinned); + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock_irqrestore(&new_base->lock, flags); + put_cpu_ptr(&timer_bases); + } +} + +#else + static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; @@ -1934,6 +1993,8 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +#endif /* CONFIG_CPU_ISOLATION_OPT */ + int timers_prepare_cpu(unsigned int cpu) { struct timer_base *base; @@ -1949,6 +2010,21 @@ int timers_prepare_cpu(unsigned int cpu) return 0; } +#ifdef CONFIG_CPU_ISOLATION_OPT +int timers_dead_cpu(unsigned int cpu) +{ + BUG_ON(cpu_online(cpu)); + __migrate_timers(cpu, true); + return 0; +} + +void timer_quiesce_cpu(void *cpup) +{ + __migrate_timers(*(unsigned int *)cpup, false); +} + +#else + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; @@ -1985,6 +2061,8 @@ int timers_dead_cpu(unsigned int cpu) return 0; } +#endif /* CONFIG_CPU_ISOLATION_OPT */ + #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 01bf977090dc2e699b105dd7c7e0c254d98ca95a..9d3ca28c6f8d498ae86ce41fc746e060d12cda53 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -171,6 +172,7 @@ static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(unsigned int, watchdog_en); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); @@ -428,16 +430,20 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void watchdog_enable(unsigned int cpu) +void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); struct completion *done = this_cpu_ptr(&softlockup_completion); + unsigned int *enabled = this_cpu_ptr(&watchdog_en); WARN_ON_ONCE(cpu != smp_processor_id()); init_completion(done); complete(done); + if (*enabled) + return; + /* * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. @@ -452,11 +458,24 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); + + /* + * Need to ensure above operations are observed by other CPUs before + * indicating that timer is enabled. This is to synchronize core + * isolation and hotplug. Core isolation will wait for this flag to be + * set. 
+ */ + mb(); + *enabled = 1; } -static void watchdog_disable(unsigned int cpu) +void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = per_cpu_ptr(&watchdog_en, cpu); + + if (!*enabled) + return; WARN_ON_ONCE(cpu != smp_processor_id()); @@ -468,6 +487,17 @@ static void watchdog_disable(unsigned int cpu) watchdog_nmi_disable(cpu); hrtimer_cancel(hrtimer); wait_for_completion(this_cpu_ptr(&softlockup_completion)); + + /* + * No need for barrier here since disabling the watchdog is + * synchronized with hotplug lock + */ + *enabled = 0; +} + +bool watchdog_configured(unsigned int cpu) +{ + return *per_cpu_ptr(&watchdog_en, cpu); } static int softlockup_stop_fn(void *data) diff --git a/mm/vmstat.c b/mm/vmstat.c index a03aa6b3e4dcb638438e969db4f0deb5f8f7ef20..ec58ac28b4f7cab032872693819ac6a759742fa7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1881,7 +1881,7 @@ int vmstat_refresh(struct ctl_table *table, int write, static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats(true)) { + if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1973,7 +1973,8 @@ static void vmstat_shepherd(struct work_struct *w) for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); - if (!delayed_work_pending(dw) && need_update(cpu)) + if (!delayed_work_pending(dw) && need_update(cpu) && + !cpu_isolated(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } put_online_cpus();
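A recurring pattern across this series is that periodic per-CPU work and load-balancer bookkeeping consult cpu_isolated() in addition to cpu_online(): update_max_interval() scales with the online-but-not-isolated CPU count, vmstat_shepherd() skips isolated CPUs when queueing deferred work, and wake_up_all_idle_cpus() leaves them alone. The userspace model below is a minimal sketch of that pattern, with plain bitmasks and made-up CPU numbers standing in for the real cpumasks; none of it is kernel API.

#include <stdio.h>

/* Userspace model of the online/isolated masks used throughout the patch
 * (plain bitmasks standing in for struct cpumask; illustrative only). */
static unsigned long online_mask   = 0xffUL; /* CPUs 0-7 online */
static unsigned long isolated_mask = 0xc0UL; /* CPUs 6-7 isolated by core_ctl */

static int cpu_isolated_model(int cpu)
{
	return !!(isolated_mask & (1UL << cpu));
}

int main(void)
{
	unsigned long avail = online_mask & ~isolated_mask;
	int cpu, available_cpus = 0;

	for (cpu = 0; cpu < 8; cpu++) {
		/* Mirrors the vmstat_shepherd()/wake_up_all_idle_cpus() checks:
		 * periodic work is only queued on CPUs that are online and
		 * not isolated. */
		if ((online_mask & (1UL << cpu)) && !cpu_isolated_model(cpu)) {
			available_cpus++;
			printf("CPU%d: eligible for deferred work\n", cpu);
		}
	}

	/* Mirrors update_max_interval(): the balance interval scales with the
	 * number of online, un-isolated CPUs rather than all online CPUs. */
	printf("available_cpus = %d (mask %#lx)\n", available_cpus, avail);
	return 0;
}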