diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 8f1d6569564c4099deea31b1ebfd2371481ee36a..0977edfc305ee834509992b09959ab7feca4a9f3 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -180,9 +180,42 @@ static struct attribute_group crash_note_cpu_attr_group = { }; #endif +#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_HOTPLUG_CPU +static ssize_t isolate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + ssize_t rc; + int cpuid = cpu->dev.id; + unsigned int isolated = cpu_isolated(cpuid); + + rc = snprintf(buf, PAGE_SIZE-2, "%d\n", isolated); + + return rc; +} + +static DEVICE_ATTR_RO(isolate); + +static struct attribute *cpu_isolated_attrs[] = { + &dev_attr_isolate.attr, + NULL +}; + +static struct attribute_group cpu_isolated_attr_group = { + .attrs = cpu_isolated_attrs, +}; +#endif +#endif + static const struct attribute_group *common_cpu_attr_groups[] = { #ifdef CONFIG_KEXEC &crash_note_cpu_attr_group, +#endif +#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_HOTPLUG_CPU + &cpu_isolated_attr_group, +#endif #endif NULL }; @@ -190,6 +223,11 @@ static const struct attribute_group *common_cpu_attr_groups[] = { static const struct attribute_group *hotplugable_cpu_attr_groups[] = { #ifdef CONFIG_KEXEC &crash_note_cpu_attr_group, +#endif +#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_HOTPLUG_CPU + &cpu_isolated_attr_group, +#endif #endif NULL }; @@ -220,6 +258,9 @@ static struct cpu_attr cpu_attrs[] = { _CPU_ATTR(online, &__cpu_online_mask), _CPU_ATTR(possible, &__cpu_possible_mask), _CPU_ATTR(present, &__cpu_present_mask), +#ifdef CONFIG_SCHED_WALT + _CPU_ATTR(core_ctl_isolated, &__cpu_isolated_mask), +#endif }; /* @@ -465,6 +506,9 @@ static struct attribute *cpu_root_attrs[] = { &cpu_attrs[0].attr.attr, &cpu_attrs[1].attr.attr, &cpu_attrs[2].attr.attr, +#ifdef CONFIG_SCHED_WALT + &cpu_attrs[3].attr.attr, +#endif &dev_attr_kernel_max.attr, &dev_attr_offline.attr, &dev_attr_isolated.attr, diff --git a/fs/proc/base.c b/fs/proc/base.c index 9b3038f1b9b5051bd666a8257957462196fb2e07..1e12fc8951014806068fab67403e55596adad9b7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include #include @@ -1573,6 +1574,70 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { #endif /* CONFIG_SCHED_AUTOGROUP */ +#ifdef CONFIG_SCHED_WALT +static int sched_init_task_load_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + seq_printf(m, "%d\n", sched_get_init_task_load(p)); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_init_task_load_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int init_task_load, err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &init_task_load); + if (err) + goto out; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = sched_set_init_task_load(p, init_task_load); + + put_task_struct(p); + +out: + return err < 0 ? 
err : count; +} + +static int sched_init_task_load_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_init_task_load_show, inode); +} + +static const struct file_operations proc_pid_sched_init_task_load_operations = { + .open = sched_init_task_load_open, + .read = seq_read, + .write = sched_init_task_load_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_TIME_NS static int timens_offsets_show(struct seq_file *m, void *v) { @@ -3199,6 +3264,9 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_WALT + REG("sched_init_task_load", 00644, proc_pid_sched_init_task_load_operations), +#endif #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8fb893ed205e3c77e82d39ae70e4984e226c55fd..9be54199902bdee7835fd0ce268cb15f1dec72ff 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -71,6 +71,9 @@ enum cpuhp_state { CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, +#ifdef CONFIG_SCHED_WALT + CPUHP_CORE_CTL_ISOLATION_DEAD, +#endif CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_POWERPC_PMAC_PREPARE, CPUHP_POWERPC_MMU_CTX_PREPARE, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f0d895d6ac39f1b1d8a45c54ec00eb66ec113224..34d869d18e168d9892757cd91c3b127904cc5a8a 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -55,6 +55,7 @@ extern unsigned int nr_cpu_ids; * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration + * cpu_isolated_mask- has bit 'cpu' set iff cpu isolated * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. 
* @@ -96,6 +97,11 @@ extern struct cpumask __cpu_active_mask; #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) +#ifdef CONFIG_SCHED_WALT +extern struct cpumask __cpu_isolated_mask; +#define cpu_isolated_mask ((const struct cpumask *)&__cpu_isolated_mask) +#endif + extern atomic_t __num_online_cpus; #if NR_CPUS > 1 @@ -129,6 +135,22 @@ static inline unsigned int num_online_cpus(void) #define cpu_active(cpu) ((cpu) == 0) #endif +#if defined(CONFIG_SCHED_WALT) && NR_CPUS > 1 +#define num_isolated_cpus() cpumask_weight(cpu_isolated_mask) +#define num_online_uniso_cpus() \ +({ \ + cpumask_t mask; \ + \ + cpumask_andnot(&mask, cpu_online_mask, cpu_isolated_mask); \ + cpumask_weight(&mask); \ +}) +#define cpu_isolated(cpu) cpumask_test_cpu((cpu), cpu_isolated_mask) +#else /* !CONFIG_SCHED_WALT || NR_CPUS == 1 */ +#define num_isolated_cpus() 0U +#define num_online_uniso_cpus() num_online_cpus() +#define cpu_isolated(cpu) 0U +#endif + extern cpumask_t cpus_booted_once_mask; static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) @@ -811,6 +833,9 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) +#ifdef CONFIG_SCHED_WALT +#define for_each_isolated_cpu(cpu) for_each_cpu((cpu), cpu_isolated_mask) +#endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); @@ -851,6 +876,17 @@ set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, &__cpu_active_mask); } +#ifdef CONFIG_SCHED_WALT +static inline void +set_cpu_isolated(unsigned int cpu, bool isolated) +{ + if (isolated) + cpumask_set_cpu(cpu, &__cpu_isolated_mask); + else + cpumask_clear_cpu(cpu, &__cpu_isolated_mask); +} +#endif + /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * diff --git a/include/linux/device.h b/include/linux/device.h index 5dc0f81e4f9d41d9d9b59cd1197fb7c2d61ff6a6..de0c8ead46965c7dab3c31fd6f57bff8ad0eccb0 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -813,6 +813,7 @@ static inline bool device_supports_offline(struct device *dev) void lock_device_hotplug(void); void unlock_device_hotplug(void); int lock_device_hotplug_sysfs(void); +void lock_device_hotplug_assert(void); int device_offline(struct device *dev); int device_online(struct device *dev); void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7f1b8549ebcee19bec2c919f71c17b1d610108cd..aa0cd15cb05c9b2e754519b84d70979b552b6323 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -74,6 +74,7 @@ enum hrtimer_restart { * * 0x00 inactive * 0x01 enqueued into rbtree + * 0x02 timer is pinned to a cpu * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free @@ -93,6 +94,8 @@ enum hrtimer_restart { */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 +#define HRTIMER_PINNED_SHIFT 1 +#define HRTIMER_STATE_PINNED (1 << HRTIMER_PINNED_SHIFT) /** * struct hrtimer - the basic hrtimer structure @@ -367,6 +370,9 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) /* Exported timer functions: */ +/* To 
be used from cpusets, only */ +extern void hrtimer_quiesce_cpu(void *cpup); + /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 750c7f395ca907f772ad8e784e6df32e473ead72..baa964d5d38c178df68abc4521aa14c1c894a32b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -13,6 +13,11 @@ #ifdef CONFIG_LOCKUP_DETECTOR void lockup_detector_init(void); +#ifdef CONFIG_SCHED_WALT +extern void watchdog_enable(unsigned int cpu); +extern void watchdog_disable(unsigned int cpu); +extern bool watchdog_configured(unsigned int cpu); +#endif void lockup_detector_soft_poweroff(void); void lockup_detector_cleanup(void); bool is_hardlockup(void); @@ -37,6 +42,22 @@ extern int sysctl_hardlockup_all_cpu_backtrace; static inline void lockup_detector_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } static inline void lockup_detector_cleanup(void) { } +#ifdef CONFIG_SCHED_WALT +static inline void watchdog_enable(unsigned int cpu) +{ +} +static inline void watchdog_disable(unsigned int cpu) +{ +} +static inline bool watchdog_configured(unsigned int cpu) +{ + /* + * Pretend the watchdog is always configured. + * We will be waiting for the watchdog to be enabled in core isolation + */ + return true; +} +#endif #endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR diff --git a/include/linux/sched.h b/include/linux/sched.h index 53198ac3d154ac270a19b5af9050bbda65a1f572..66e26f69967c4e2f91801da719358fc34272a435 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -213,6 +213,50 @@ struct io_uring_task; /* Task command name length: */ #define TASK_COMM_LEN 16 +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_SCHED_WALT) +extern int sched_isolate_count(const cpumask_t *mask, bool include_offline); +extern int sched_isolate_cpu(int cpu); +extern int sched_unisolate_cpu(int cpu); +extern int sched_unisolate_cpu_unlocked(int cpu); +#else +static inline int sched_isolate_count(const cpumask_t *mask, + bool include_offline) +{ + cpumask_t count_mask; + + if (include_offline) + cpumask_andnot(&count_mask, mask, cpu_online_mask); + else + return 0; + + return cpumask_weight(&count_mask); +} + +static inline int sched_isolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu_unlocked(int cpu) +{ + return 0; +} +#endif + extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -495,6 +539,53 @@ struct sched_entity { #endif }; +#ifdef CONFIG_SCHED_WALT +extern void sched_exit(struct task_struct *p); +extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct); +extern u32 sched_get_init_task_load(struct task_struct *p); +extern void free_task_load_ptrs(struct task_struct *p); +#define RAVG_HIST_SIZE_MAX 5 +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. 
+ * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. + * + * 'curr_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the current window + * + * 'prev_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the previous window + * + * 'curr_window' represents the sum of all entries in curr_window_cpu + * + * 'prev_window' represents the sum of all entries in prev_window_cpu + * + */ + u64 mark_start; + u32 sum, demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 *curr_window_cpu, *prev_window_cpu; + u32 curr_window, prev_window; + u16 active_windows; + u16 demand_scaled; +}; +#else +static inline void sched_exit(struct task_struct *p) { } +static inline void free_task_load_ptrs(struct task_struct *p) { } +#endif /* CONFIG_SCHED_WALT */ + struct sched_rt_entity { struct list_head run_list; unsigned long timeout; @@ -700,6 +791,16 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; + u64 last_sleep_ts; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/core_ctl.h b/include/linux/sched/core_ctl.h new file mode 100644 index 0000000000000000000000000000000000000000..57f14b52c9412af4695c66acc78ef76a21c8a3c6 --- /dev/null +++ b/include/linux/sched/core_ctl.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, 2018, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +struct core_ctl_notif_data { + unsigned int nr_big; + unsigned int coloc_load_pct; +}; + +extern void core_ctl_check(u64 wallclock); + +#ifdef CONFIG_SCHED_WALT +int core_ctl_set_boost(bool boost); +void core_ctl_notifier_register(struct notifier_block *n); +void core_ctl_notifier_unregister(struct notifier_block *n); +#else +static inline int core_ctl_set_boost(bool boost) +{ + return 0; +} +static inline void core_ctl_notifier_register(struct notifier_block *n) {} +static inline void core_ctl_notifier_unregister(struct notifier_block *n) {} +#endif +#endif diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 3ed5aa18593f2aa4d7274e6def1c970117568376..c7cf63236f5bc73732ed0831c106116080e4e567 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -9,6 +9,8 @@ */ #define SCHED_CPUFREQ_IOWAIT (1U << 0) +#define SCHED_CPUFREQ_WALT (1U << 1) +#define SCHED_CPUFREQ_CONTINUE (1U << 2) #ifdef CONFIG_CPU_FREQ struct cpufreq_policy; diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index cc9f393e2a70603ed16974afeb60aa6569bcd26d..049a1bba90bce3b90b4c2cd2faaa68b35e2e6928 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -28,10 +28,25 @@ extern void __init housekeeping_init(void); #else +#ifdef CONFIG_SCHED_WALT +static inline int housekeeping_any_cpu(enum hk_flags flags) +{ + cpumask_t available; + int cpu; + + cpumask_andnot(&available, cpu_online_mask, cpu_isolated_mask); + cpu = cpumask_any(&available); + if (cpu >= nr_cpu_ids) + cpu = smp_processor_id(); + + return cpu; +} +#else static inline int housekeeping_any_cpu(enum hk_flags flags) { return smp_processor_id(); } +#endif static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { @@ -54,7 +69,11 @@ static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) if (static_branch_unlikely(&housekeeping_overridden)) return housekeeping_test_cpu(cpu, flags); #endif +#ifdef CONFIG_SCHED_WALT + return !cpu_isolated(cpu); +#else return true; +#endif } #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 568286411b43ab5b48378eea1bbb0441bb8a520f..ca8b0d1ccf942b59ff316bb3ffc4433ade1eb140 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -21,6 +21,15 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); +#ifdef CONFIG_SCHED_WALT +extern unsigned int sched_get_cpu_util(int cpu); +#else +static inline unsigned int sched_get_cpu_util(int cpu) +{ + return 0; +} +#endif + static inline int sched_info_on(void) { #ifdef CONFIG_SCHEDSTATS diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3c31ba88aca59eb04dab769aad927fcfad7057ca..210909cd4141da8fee7055484987e9ae019d1ce5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -30,6 +30,16 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_cpu_high_irqload; + +extern int +sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, 
+ int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 63ea9aff368f0daee3f876f20e8a90f6c0fef9ba..6037d00555c769cba2621529a38d13be72417cd4 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -32,6 +32,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); @@ -80,6 +81,14 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, return false; } +static inline int stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + if (cpumask_test_cpu(raw_smp_processor_id(), cpumask)) + return stop_one_cpu(raw_smp_processor_id(), fn, arg); + return -ENOENT; +} + #endif /* CONFIG_SMP */ /* diff --git a/include/linux/timer.h b/include/linux/timer.h index d10bc7e73b41eff9a20900ae63f0c1538b71078b..8cb904ce3dca1966280e21df8930181cdba425d8 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -180,6 +180,9 @@ extern int timer_reduce(struct timer_list *timer, unsigned long expires); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) +/* To be used from cpusets, only */ +extern void timer_quiesce_cpu(void *cpup); + extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c96a4337afe6c562e0b59fee69072de6424f3d72..245671bc6292a581372be1400a48191892d97bff 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -600,6 +600,162 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); +#ifdef CONFIG_SCHED_WALT + +TRACE_EVENT(core_ctl_eval_need, + + TP_PROTO(unsigned int cpu, unsigned int old_need, + unsigned int new_need, unsigned int updated), + TP_ARGS(cpu, old_need, new_need, updated), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, old_need) + __field(u32, new_need) + __field(u32, updated) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->old_need = old_need; + __entry->new_need = new_need; + __entry->updated = updated; + ), + TP_printk("cpu=%u, old_need=%u, new_need=%u, updated=%u", __entry->cpu, + __entry->old_need, __entry->new_need, __entry->updated) +); + +TRACE_EVENT(core_ctl_set_busy, + + TP_PROTO(unsigned int cpu, unsigned int busy, + unsigned int old_is_busy, unsigned int is_busy), + TP_ARGS(cpu, busy, old_is_busy, is_busy), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, busy) + __field(u32, old_is_busy) + __field(u32, is_busy) + __field(bool, high_irqload) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->busy = busy; + __entry->old_is_busy = old_is_busy; + __entry->is_busy = is_busy; + __entry->high_irqload = sched_cpu_high_irqload(cpu); + ), + TP_printk("cpu=%u, busy=%u, old_is_busy=%u, new_is_busy=%u high_irqload=%d", + __entry->cpu, __entry->busy, __entry->old_is_busy, + __entry->is_busy, __entry->high_irqload) +); + +TRACE_EVENT(core_ctl_set_boost, + + TP_PROTO(u32 refcount, s32 ret), + TP_ARGS(refcount, ret), + TP_STRUCT__entry( + __field(u32, refcount) + __field(s32, ret) + ), + 
TP_fast_assign( + __entry->refcount = refcount; + __entry->ret = ret; + ), + TP_printk("refcount=%u, ret=%d", __entry->refcount, __entry->ret) +); + +TRACE_EVENT(core_ctl_update_nr_need, + + TP_PROTO(int cpu, int nr_need, int prev_misfit_need, + int nrrun, int max_nr, int nr_prev_assist), + + TP_ARGS(cpu, nr_need, prev_misfit_need, nrrun, max_nr, nr_prev_assist), + + TP_STRUCT__entry( + __field( int, cpu) + __field( int, nr_need) + __field( int, prev_misfit_need) + __field( int, nrrun) + __field( int, max_nr) + __field( int, nr_prev_assist) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr_need = nr_need; + __entry->prev_misfit_need = prev_misfit_need; + __entry->nrrun = nrrun; + __entry->max_nr = max_nr; + __entry->nr_prev_assist = nr_prev_assist; + ), + + TP_printk("cpu=%d nr_need=%d prev_misfit_need=%d nrrun=%d max_nr=%d nr_prev_assist=%d", + __entry->cpu, __entry->nr_need, __entry->prev_misfit_need, + __entry->nrrun, __entry->max_nr, __entry->nr_prev_assist) +); +/* + * Tracepoint for sched_get_nr_running_avg + */ +TRACE_EVENT(sched_get_nr_running_avg, + + TP_PROTO(int cpu, int nr, int nr_misfit, int nr_max), + + TP_ARGS(cpu, nr, nr_misfit, nr_max), + + TP_STRUCT__entry( + __field( int, cpu) + __field( int, nr) + __field( int, nr_misfit) + __field( int, nr_max) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr = nr; + __entry->nr_misfit = nr_misfit; + __entry->nr_max = nr_max; + ), + + TP_printk("cpu=%d nr=%d nr_misfit=%d nr_max=%d", + __entry->cpu, __entry->nr, __entry->nr_misfit, __entry->nr_max) +); + +/* + * sched_isolate - called when cores are isolated/unisolated + * + * @acutal_mask: mask of cores actually isolated/unisolated + * @req_mask: mask of cores requested isolated/unisolated + * @online_mask: cpu online mask + * @time: amount of time in us it took to isolate/unisolate + * @isolate: 1 if isolating, 0 if unisolating + * + */ +TRACE_EVENT(sched_isolate, + + TP_PROTO(unsigned int requested_cpu, unsigned int isolated_cpus, + u64 start_time, unsigned char isolate), + + TP_ARGS(requested_cpu, isolated_cpus, start_time, isolate), + + TP_STRUCT__entry( + __field(u32, requested_cpu) + __field(u32, isolated_cpus) + __field(u32, time) + __field(unsigned char, isolate) + ), + + TP_fast_assign( + __entry->requested_cpu = requested_cpu; + __entry->isolated_cpus = isolated_cpus; + __entry->time = div64_u64(sched_clock() - start_time, 1000); + __entry->isolate = isolate; + ), + + TP_printk("iso cpu=%u cpus=0x%x time=%u us isolated=%d", + __entry->requested_cpu, __entry->isolated_cpus, + __entry->time, __entry->isolate) +); + +#endif + /* * Following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. 
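The core_ctl tracepoints defined above are emitted from kernel/sched/core_ctl.c, added later in this patch. As a rough, illustrative sketch of the intended call pattern only (not the code this patch adds; compute_cluster_need() is a hypothetical helper), a need-evaluation path would report its decision roughly like this:

/* Illustrative sketch only -- not code added by this patch. */
static bool eval_need_sketch(struct cluster_data *cluster, unsigned int cpu)
{
	unsigned int old_need = cluster->need_cpus;
	unsigned int new_need = compute_cluster_need(cluster);	/* hypothetical */
	bool updated = (new_need != old_need);

	if (updated)
		cluster->need_cpus = new_need;

	trace_core_ctl_eval_need(cpu, old_need, new_need, updated);
	return updated;
}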
diff --git a/include/trace/events/walt.h b/include/trace/events/walt.h new file mode 100644 index 0000000000000000000000000000000000000000..603889af1de21dd6b9f0af42ddb2b8e486f8e807 --- /dev/null +++ b/include/trace/events/walt.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM walt + +#if !defined(_TRACE_WALT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WALT_H + +#include +#include + +struct rq; +extern const char *task_event_names[]; + +#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_WALT) +static inline void __window_data(u32 *dst, u32 *src) +{ + if (src) + memcpy(dst, src, nr_cpu_ids * sizeof(u32)); + else + memset(dst, 0, nr_cpu_ids * sizeof(u32)); +} + +struct trace_seq; +const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + seq_buf_used(&p->seq); + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%u ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} + +static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new) +{ + if (curr) + if (new) + return rq->nt_curr_runnable_sum; + else + return rq->curr_runnable_sum; + else + if (new) + return rq->nt_prev_runnable_sum; + else + return rq->prev_runnable_sum; +} + +#endif + +TRACE_EVENT(sched_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + enum task_event evt), + + TP_ARGS(rq, p, runtime, samples, evt), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field(unsigned int, runtime ) + __field( int, samples ) + __field(enum task_event, evt ) + __field(unsigned int, demand ) + __array( u32, hist, RAVG_HIST_SIZE_MAX) + __field( int, cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = p->ravg.demand; + memcpy(__entry->hist, p->ravg.sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %s demand %u" + " (hist: %u %u %u %u %u) cpu %d", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, + task_event_names[__entry->evt], __entry->demand, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu) +); + +TRACE_EVENT(sched_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt, + u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( pid_t, cur_pid ) + __field(unsigned int, cur_freq ) + __field( u64, wallclock ) + __field( u64, mark_start ) + __field( u64, delta_m ) + __field( u64, win_start ) + __field( u64, delta ) + __field( u64, irqtime ) + __field(enum task_event, evt ) + __field(unsigned int, demand ) + __field(unsigned int, sum ) + __field( int, cpu ) + __field( u64, rq_cs ) + __field( u64, rq_ps ) + __field( u32, curr_window ) + __field( u32, prev_window ) + __dynamic_array(u32, curr_sum, nr_cpu_ids ) + __dynamic_array(u32, prev_sum, nr_cpu_ids ) + __field( u64, nt_cs ) + __field( u64, nt_ps ) + __field( u32, active_windows ) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->delta = (wallclock - rq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + 
__entry->cur_freq = rq->cluster->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->delta_m = (wallclock - p->ravg.mark_start); + __entry->demand = p->ravg.demand; + __entry->sum = p->ravg.sum; + __entry->irqtime = irqtime; + __entry->rq_cs = rq->curr_runnable_sum; + __entry->rq_ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + __window_data(__get_dynamic_array(curr_sum), p->ravg.curr_window_cpu); + __window_data(__get_dynamic_array(prev_sum), p->ravg.prev_window_cpu); + __entry->nt_cs = rq->nt_curr_runnable_sum; + __entry->nt_ps = rq->nt_prev_runnable_sum; + __entry->active_windows = p->ravg.active_windows; + ), + + TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u", + __entry->wallclock, __entry->win_start, __entry->delta, + task_event_names[__entry->evt], __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->sum, __entry->irqtime, + __entry->rq_cs, __entry->rq_ps, __entry->curr_window, + __window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids), + __entry->prev_window, + __window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids), + __entry->nt_cs, __entry->nt_ps, + __entry->active_windows) +); + +#endif /* _TRACE_WALT_H */ + +/* This part must be outside protection */ +#include diff --git a/init/Kconfig b/init/Kconfig index fc4c9f416fadbb36ca025d824805ccb9c641a9d3..8b20edacf9217c2ced62e63344c5c338e03e4832 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -526,6 +526,15 @@ config SCHED_THERMAL_PRESSURE This requires the architecture to implement arch_set_thermal_pressure() and arch_get_thermal_pressure(). +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. 
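Because the cpumask.h hunk above defines cpu_isolated() as 0 and num_online_uniso_cpus() as num_online_cpus() when CONFIG_SCHED_WALT is disabled, callers may use these helpers unconditionally. A minimal illustrative sketch (pick_work_cpu_sketch() is a hypothetical caller, not part of this patch):

#include <linux/cpumask.h>
#include <linux/smp.h>

/* Illustrative sketch only: prefer an online, un-isolated CPU. */
static int pick_work_cpu_sketch(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (!cpu_isolated(cpu))
			return cpu;
	}

	/* All online CPUs being isolated is not expected; fall back to self. */
	return raw_smp_processor_id();
}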
+ config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/cpu.c b/kernel/cpu.c index 67c22941b5f275e05fe4e2de549e427f1435b96f..fde59877742d89c6a7ac0232936dc36fa474cc53 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1052,6 +1052,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (!cpu_present(cpu)) return -EINVAL; + if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1) + return -EBUSY; + cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; @@ -2495,6 +2498,9 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); +struct cpumask __cpu_isolated_mask __read_mostly; +EXPORT_SYMBOL(__cpu_isolated_mask); + atomic_t __num_online_cpus __read_mostly; EXPORT_SYMBOL(__num_online_cpus); @@ -2513,6 +2519,11 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(&__cpu_online_mask, src); } +void init_cpu_isolated(const struct cpumask *src) +{ + cpumask_copy(&__cpu_isolated_mask, src); +} + void set_cpu_online(unsigned int cpu, bool online) { /* diff --git a/kernel/exit.c b/kernel/exit.c index d13d67fc5f4e2085f93c46b77440ca1eeb908833..795e16ecc422a09980726dc1c2f179b6023b4746 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -765,6 +765,7 @@ void __noreturn do_exit(long code) io_uring_files_cancel(tsk->files); exit_signals(tsk); /* sets PF_EXITING */ + sched_exit(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) diff --git a/kernel/fork.c b/kernel/fork.c index 39b1783a7613e2bdc67c8201af1f91d8c401b2fc..7528c3f3736e7f4f75bd47e1b2a96af5097237a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2369,6 +2369,7 @@ static __latent_entropy struct task_struct *copy_process( perf_event_free_task(p); bad_fork_cleanup_policy: lockdep_free_task(p); + free_task_load_ptrs(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 02236b13b359974e95a2e3ca4ec3f0cadfd0962a..89981adfb7483381e4d0e51a76f570a6348991b1 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "internals.h" @@ -58,6 +59,9 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity; bool brokeaff = false; int err; +#ifdef CONFIG_SCHED_WALT + struct cpumask available_cpus; +#endif /* * IRQ chip might be already torn down, but the irq descriptor is @@ -110,7 +114,17 @@ static bool migrate_one_irq(struct irq_desc *desc) if (maskchip && chip->irq_mask) chip->irq_mask(d); +#ifdef CONFIG_SCHED_WALT + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; +#endif + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { +#ifdef CONFIG_SCHED_WALT + const struct cpumask *default_affinity; +#endif + /* * If the interrupt is managed, then shut it down and leave * the affinity untouched. @@ -120,16 +134,47 @@ static bool migrate_one_irq(struct irq_desc *desc) irq_shutdown_and_deactivate(desc); return false; } + +#ifdef CONFIG_SCHED_WALT + default_affinity = desc->affinity_hint ? 
: irq_default_affinity; + /* + * The order of preference for selecting a fallback CPU is + * + * (1) online and un-isolated CPU from default affinity + * (2) online and un-isolated CPU + * (3) online CPU + */ + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_intersects(&available_cpus, default_affinity)) + cpumask_and(&available_cpus, &available_cpus, + default_affinity); + else if (cpumask_empty(&available_cpus)) + affinity = cpu_online_mask; + + /* + * We are overriding the affinity with all online and + * un-isolated cpus. irq_set_affinity_locked() call + * below notify this mask to PM QOS affinity listener. + * That results in applying the CPU_DMA_LATENCY QOS + * to all the CPUs specified in the mask. But the low + * level irqchip driver sets the affinity of an irq + * to only one CPU. So pick only one CPU from the + * prepared mask while overriding the user affinity. + */ + affinity = cpumask_of(cpumask_any(affinity)); +#else affinity = cpu_online_mask; +#endif brokeaff = true; } /* - * Do not set the force argument of irq_do_set_affinity() as this + * Do not set the force argument of irq_set_affinity_locked() as this * disables the masking of offline CPUs from the supplied affinity * mask and therefore might keep/reassign the irq to the outgoing * CPU. */ - err = irq_do_set_affinity(d, affinity, false); + err = irq_set_affinity_locked(d, affinity, false); if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 72513ed2a5fc664be5d3a4a1ac83ec3d365c6370..b6e4c4930b0421dfb3bc962ed3013d84b04d9609 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -154,6 +154,11 @@ static ssize_t write_irq_affinity(int type, struct file *file, if (err) goto free_cpumask; + if (cpumask_subset(new_value, cpu_isolated_mask)) { + err = -EINVAL; + goto free_cpumask; + } + /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5fc9c9b70862f72486b4b6e57711d530ceaeca18..b74f3553f0b510582e57b7a25e6400dee56189c2 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -27,6 +27,7 @@ obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o +obj-$(CONFIG_SCHED_WALT) += walt.o core_ctl.o sched_avg.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e4551d1736fa38c354c9ab500f5184aa845f10a6..e8c671e7d3b02a65c0c22df8a25cbf40f63fb2c0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include @@ -26,6 +28,7 @@ #include "pelt.h" #include "smp.h" +#include "walt.h" /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -1885,6 +1888,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq_flags rf; struct rq *rq; int ret = 0; +#ifdef CONFIG_SCHED_WALT + cpumask_t allowed_mask; +#endif rq = task_rq_lock(p, &rf); update_rq_clock(rq); @@ -1908,6 +1914,20 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + cpumask_and(&allowed_mask, &allowed_mask, cpu_valid_mask); + + dest_cpu 
= cpumask_any(&allowed_mask); + if (dest_cpu >= nr_cpu_ids) { + cpumask_and(&allowed_mask, cpu_valid_mask, new_mask); + dest_cpu = cpumask_any(&allowed_mask); + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + ret = -EINVAL; + goto out; + } + } +#else /* * Picking a ~random cpu helps in cases where we are changing affinity * for groups of tasks (ie. cpuset), so that load balancing is not @@ -1918,6 +1938,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, ret = -EINVAL; goto out; } +#endif do_set_cpus_allowed(p, new_mask); @@ -1932,8 +1953,13 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, } /* Can the task run on the task's current CPU? If so, we're done */ +#ifdef CONFIG_SCHED_WALT + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) + goto out; +#else if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; +#endif if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; @@ -2007,6 +2033,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); + fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -2284,12 +2311,15 @@ EXPORT_SYMBOL_GPL(kick_process); * select_task_rq() below may allow selection of !active CPUs in order * to satisfy the above rules. */ -static int select_fallback_rq(int cpu, struct task_struct *p) +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; - enum { cpuset, possible, fail } state = cpuset; + enum { cpuset, possible, fail, bug } state = cpuset; int dest_cpu; +#ifdef CONFIG_SCHED_WALT + int isolated_candidate = -1; +#endif /* * If the node that the CPU is on has been offlined, cpu_to_node() @@ -2303,6 +2333,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, nodemask) { if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) return dest_cpu; } @@ -2313,7 +2345,18 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, p->cpus_ptr) { if (!is_cpu_allowed(p, dest_cpu)) continue; +#ifdef CONFIG_SCHED_WALT + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; +#endif goto out; } @@ -2332,6 +2375,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) break; case fail: +#ifdef CONFIG_SCHED_WALT + + allow_iso = true; + state = bug; + break; +#else + /* fall through */ +#endif + + case bug: BUG(); break; } @@ -2359,6 +2412,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { + bool allow_isolated = (p->flags & PF_KTHREAD); + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -2376,8 +2431,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!is_cpu_allowed(p, cpu))) - cpu = select_fallback_rq(task_cpu(p), p); + if (unlikely(!is_cpu_allowed(p, cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); return cpu; } @@ -2794,6 +2850,26 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * accesses to the 
task state; see try_to_wake_up() and set_current_state(). */ +#ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +/* utility function to update walt signals at wakeup */ +static inline void walt_try_to_wake_up(struct task_struct *p) +{ + struct rq *rq = cpu_rq(task_cpu(p)); + struct rq_flags rf; + u64 wallclock; + + rq_lock_irqsave(rq, &rf); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + rq_unlock_irqrestore(rq, &rf); +} +#else +#define walt_try_to_wake_up(a) {} +#endif +#endif + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -2928,6 +3004,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_acquire__after_ctrl_dep(); + walt_try_to_wake_up(p); + /* * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq * == 0), which means we need to do an enqueue, change p->state to @@ -3233,6 +3311,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; + init_new_task_load(p); __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -3363,6 +3442,8 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(p); + mark_task_starting(p); + activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -3905,7 +3986,7 @@ void sched_exec(void) if (dest_cpu == smp_processor_id()) goto unlock; - if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -3995,6 +4076,7 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; + u64 wallclock; unsigned long thermal_pressure; arch_scale_freq_tick(); @@ -4002,6 +4084,9 @@ void scheduler_tick(void) rq_lock(rq, &rf); + set_window_start(rq); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_rq_clock(rq); thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); @@ -4423,6 +4508,7 @@ static void __sched notrace __schedule(bool preempt) struct rq_flags rf; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -4505,7 +4591,13 @@ static void __sched notrace __schedule(bool preempt) clear_tsk_need_resched(prev); clear_preempt_need_resched(); + wallclock = sched_ktime_clock(); if (likely(prev != next)) { + if (!prev->on_rq) + prev->last_sleep_ts = wallclock; + + update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -4535,6 +4627,7 @@ static void __sched notrace __schedule(bool preempt) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unlock_irq(rq, &rf); } @@ -5527,10 +5620,11 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) * Return: 0 on success. An error code otherwise. 
*/ int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) + const struct sched_param *param) { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); /* * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally @@ -5889,6 +5983,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; +#ifdef CONFIG_SCHED_WALT + int dest_cpu; + cpumask_t allowed_mask; +#endif rcu_read_lock(); @@ -5950,20 +6048,30 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); + if (dest_cpu < nr_cpu_ids) { +#endif + retval = __set_cpus_allowed_ptr(p, new_mask, true); + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } } +#ifdef CONFIG_SCHED_WALT + } else { + retval = -EINVAL; } +#endif + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -6027,6 +6135,16 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); + +#ifdef CONFIG_SCHED_WALT + /* The userspace tasks are forbidden to run on + * isolated CPUs. So exclude isolated CPUs from + * the getaffinity. + */ + if (!(p->flags & PF_KTHREAD)) + cpumask_andnot(mask, mask, cpu_isolated_mask); +#endif + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -6714,20 +6832,66 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) BUG(); } +#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_HOTPLUG_CPU +/* + * Remove a task from the runqueue and pretend that it's migrating. This + * should prevent migrations for the detached task and disallow further + * changes to tsk_cpus_allowed. + */ +void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks) +{ + lockdep_assert_held(&rq->lock); + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(rq, p, 0); + list_add(&p->se.group_node, tasks); +} + +void attach_tasks_core(struct list_head *tasks, struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_held(&rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + BUG_ON(task_rq(p) != rq); + activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + } +} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SCHED_WALT */ + /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). 
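+ * When migrate_pinned_tasks is false, kthreads that cannot run on any
+ * available (online, un-isolated) CPU are detached here and re-attached
+ * to the same runqueue once the remaining tasks have been migrated.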
* * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, + bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; struct rq_flags orf = *rf; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + LIST_HEAD(tasks); + cpumask_t avail_cpus; + +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); +#else + cpumask_copy(&avail_cpus, cpu_online_mask); +#endif /* * Fudge the rq selection such that the below task selection loop @@ -6750,13 +6914,20 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) for (;;) { /* * There's this thread running, bail when that's the only - * remaining thread: + * remaining thread. */ if (rq->nr_running == 1) break; next = __pick_migrate_task(rq); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_mask)) { + detach_one_task_core(next, rq, &tasks); + num_pinned_kthreads += 1; + continue; + } + /* * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either @@ -6769,31 +6940,278 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) rq_unlock(rq, rf); raw_spin_lock(&next->pi_lock); rq_relock(rq, rf); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); /* * Since we're inside stop-machine, _nothing_ should have * changed the task, WARN if weird stuff happened, because in * that case the above rq->lock drop is a fail too. + * However, during cpu isolation the load balancer might have + * interferred since we don't stop all CPUs. Ignore warning for + * this case. */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + if (task_rq(next) != rq || !task_on_rq_queued(next)) { + WARN_ON(migrate_pinned_tasks); raw_spin_unlock(&next->pi_lock); continue; } /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); rq = __migrate_task(rq, rf, next, dest_cpu); if (rq != dead_rq) { rq_unlock(rq, rf); rq = dead_rq; *rf = orf; rq_relock(rq, rf); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); } raw_spin_unlock(&next->pi_lock); } rq->stop = stop; + + if (num_pinned_kthreads > 1) + attach_tasks_core(&tasks, rq); } + +void set_rq_online(struct rq *rq); +void set_rq_offline(struct rq *rq); + +#ifdef CONFIG_SCHED_WALT + +int do_isolation_work_cpu_stop(void *data) +{ + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + watchdog_disable(cpu); + + local_irq_disable(); + + irq_migrate_all_off_this_cpu(); + + flush_smp_call_function_from_idle(); + + /* Update our root-domain */ + rq_lock(rq, &rf); + + /* + * Temporarily mark the rq as offline. This will allow us to + * move tasks off the CPU. 
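+	 * The rq is marked online again below, once migrate_tasks() has
+	 * drained everything except pinned kthreads.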
+ */ + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, &rf, false); + + if (rq->rd) + set_rq_online(rq); + rq_unlock(rq, &rf); + + local_irq_enable(); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. + */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); +} + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq; + cpumask_t avail_cpus; + int ret_code = 0; + u64 start_time = 0; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + cpu_maps_update_begin(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) || + !cpu_online(cpu) || cpu >= NR_CPUS) { + ret_code = -EINVAL; + goto out; + } + + rq = cpu_rq(cpu); + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + --cpu_isolation_vote[cpu]; + ret_code = -EINVAL; + goto out; + } + + /* + * There is a race between watchdog being enabled by hotplug and + * core isolation disabling the watchdog. When a CPU is hotplugged in + * and the hotplug lock has been released the watchdog thread might + * not have run yet to enable the watchdog. + * We have to wait for the watchdog to be enabled before proceeding. 
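+	 * The check below retries once after a short sleep and then bails
+	 * out with -EBUSY rather than blocking isolation indefinitely.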
+ */ + if (!watchdog_configured(cpu)) { + msleep(20); + if (!watchdog_configured(cpu)) { + --cpu_isolation_vote[cpu]; + ret_code = -EBUSY; + goto out; + } + } + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + watchdog_disable(cpu); + irq_lock_sparse(); + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + irq_unlock_sparse(); + + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + cpu_maps_update_done(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + u64 start_time = 0; + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) + || cpu >= NR_CPUS) { + ret_code = -EINVAL; + goto out; + } + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + cpu_maps_update_begin(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + cpu_maps_update_done(); + return ret_code; +} + +#endif /* CONFIG_SCHED_WALT */ + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -6949,6 +7367,11 @@ int sched_cpu_deactivate(unsigned int cpu) static void sched_rq_cpu_starting(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; update_max_interval(); @@ -6971,11 +7394,12 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); + if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq, &rf); + migrate_tasks(rq, &rf, true); BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); @@ -7000,6 +7424,8 @@ void __init sched_init_smp(void) sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); + update_cluster_topology(); + /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); @@ -7062,6 +7488,8 @@ void __init sched_init(void) wait_bit_init(); + init_clusters(); + #ifdef CONFIG_FAIR_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -7173,6 +7601,7 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; + walt_sched_init_rq(rq); INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7203,6 +7632,7 @@ void __init sched_init(void) * when this 
runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + init_new_task_load(current); calc_load_update = jiffies + LOAD_FREQ; @@ -8481,3 +8911,44 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); } + +#ifdef CONFIG_SCHED_WALT +/* + * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field + * + * Stop accounting (exiting) task's future cpu usage + * + * We need this so that reset_all_windows_stats() can function correctly. + * reset_all_window_stats() depends on do_each_thread/for_each_thread task + * iterators to reset *all* task's statistics. Exiting tasks however become + * invisible to those iterators. sched_exit() is called on a exiting task prior + * to being removed from task_list, which will let reset_all_window_stats() + * function correctly. + */ +void sched_exit(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + u64 wallclock; + + rq = task_rq_lock(p, &rf); + + /* rq->curr == p */ + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + dequeue_task(rq, p, 0); + /* + * task's contribution is already removed from the + * cumulative window demand in dequeue. As the + * task's stats are reset, the next enqueue does + * not change the cumulative window demand. + */ + reset_task_stats(p); + p->ravg.mark_start = wallclock; + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + + enqueue_task(rq, p, 0); + task_rq_unlock(rq, p, &rf); + free_task_load_ptrs(p); +} +#endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 0000000000000000000000000000000000000000..b5991805fe802a37dd53fa2cd3825b5db8a9225c --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1307 @@ +/* Copyright (c) 2014-2018, 2020, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#define pr_fmt(fmt) "core_ctl: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sched.h" +#include "walt.h" + +#define MAX_CPUS_PER_CLUSTER 6 +#define MAX_CLUSTERS 3 + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + unsigned int nr_isolated_cpus; + unsigned int nr_not_preferred_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + unsigned int max_nr; + unsigned int nr_prev_assist; + unsigned int nr_prev_assist_thresh; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool enable; + int nrrun; + struct task_struct *core_ctl_thread; + unsigned int first_cpu; + unsigned int boost; + struct kobject kobj; +}; + +struct cpu_data { + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool isolated_by_us; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\ + (idx)++) + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(struct cluster_data *state); +static bool initialized; + +ATOMIC_NOTIFIER_HEAD(core_ctl_notifier); +static unsigned int last_nr_big; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); +static void cpuset_next(struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + cpuset_next(state); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + cpuset_next(state); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + 
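/*
 * Illustrative sketch (plain C, not part of the patch): the clamping rules
 * enforced by the min_cpus/max_cpus sysfs stores above.  The struct carries
 * only the fields needed to show the invariant min_cpus <= max_cpus <=
 * num_cpus; the field names mirror struct cluster_data.
 */
#include <assert.h>

struct limits {
        unsigned int num_cpus;          /* CPUs physically in the cluster */
        unsigned int min_cpus;          /* floor requested via sysfs */
        unsigned int max_cpus;          /* ceiling requested via sysfs */
};

static unsigned int umin(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

static void store_min(struct limits *l, unsigned int val)
{
        l->min_cpus = umin(val, l->max_cpus);           /* never above the ceiling */
}

static void store_max(struct limits *l, unsigned int val)
{
        l->max_cpus = umin(val, l->num_cpus);           /* never above the cluster size */
        l->min_cpus = umin(l->min_cpus, l->max_cpus);   /* drag the floor down with it */
}

int main(void)
{
        struct limits l = { .num_cpus = 4, .min_cpus = 4, .max_cpus = 4 };

        store_max(&l, 2);
        assert(l.min_cpus == 2);        /* lowering max_cpus pulls min_cpus down */
        store_min(&l, 3);
        assert(l.min_cpus == 2);        /* requests above max_cpus are clamped */
        return 0;
}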
apply_need(state); + + return count; +} + +static ssize_t show_nr_prev_assist_thresh(const struct cluster_data *state, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->nr_prev_assist_thresh); +} + +static ssize_t store_nr_prev_assist_thresh(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->nr_prev_assist_thresh = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(const struct cluster_data *state, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u %u %u\n", + &val[0], &val[1], &val[2], &val[3], + &val[4], &val[5]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u %u %u\n", + &val[0], &val[1], &val[2], &val[3], + &val[4], &val[5]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_enable(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + bool bval; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + bval = !!val; + if (bval != state->enable) { + state->enable = bval; + apply_need(state); + } + + return count; +} + +static ssize_t show_enable(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->enable); +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + spin_lock_irq(&state_lock); + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + count += 
snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", + cpu_online(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIsolated: %u\n", + cpu_isolated(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNot preferred: %u\n", + c->not_preferred); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", cluster->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr isolated CPUs: %u\n", + cluster->nr_isolated_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + spin_unlock_irq(&state_lock); + + return count; +} + +static ssize_t store_not_preferred(struct cluster_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i; + unsigned int val[MAX_CPUS_PER_CLUSTER]; + unsigned long flags; + int ret; + int not_preferred_count = 0; + + ret = sscanf(buf, "%u %u %u %u %u %u\n", + &val[0], &val[1], &val[2], &val[3], + &val[4], &val[5]); + if (ret != state->num_cpus) + return -EINVAL; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + c->not_preferred = val[i]; + not_preferred_count += !!val[i]; + } + state->nr_not_preferred_cpus = not_preferred_count; + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + +static ssize_t show_not_preferred(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + int i; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "CPU#%d: %u\n", c->cpu, c->not_preferred); + } + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *, char *); + ssize_t (*store)(struct cluster_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_rw(offline_delay_ms); +core_ctl_attr_rw(busy_up_thres); +core_ctl_attr_rw(busy_down_thres); +core_ctl_attr_rw(task_thres); +core_ctl_attr_rw(nr_prev_assist_thresh); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(not_preferred); +core_ctl_attr_rw(enable); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &offline_delay_ms.attr, + &busy_up_thres.attr, + &busy_down_thres.attr, + &task_thres.attr, + &nr_prev_assist_thresh.attr, + &enable.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + ¬_preferred.attr, + NULL +}; + +#define to_cluster_data(k) 
container_of(k, struct cluster_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +static struct sched_avg_stats nr_stats[NR_CPUS]; + +/* + * nr_need: + * Number of tasks running on this cluster plus + * tasks running on higher capacity clusters. + * To find out CPUs needed from this cluster. + * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 4 small tasks running on min capacity CPUs + * and 2 big tasks running on 2 max capacity + * CPUs, nr_need has to be 6 for min capacity + * cluster and 2 for max capacity cluster. + * This is because, min capacity cluster has to + * account for tasks running on max capacity + * cluster, so that, the min capacity cluster + * can be ready to accommodate tasks running on max + * capacity CPUs if the demand of tasks goes down. + */ +static int compute_cluster_nr_need(int index) +{ + int cpu; + struct cluster_data *cluster; + int nr_need = 0; + + for_each_cluster(cluster, index) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_need += nr_stats[cpu].nr; + } + + return nr_need; +} + +/* + * prev_misfit_need: + * Tasks running on smaller capacity cluster which + * needs to be migrated to higher capacity cluster. + * To find out how many tasks need higher capacity CPUs. + * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 2 small tasks and 2 big tasks running on + * min capacity CPUs and no tasks running on + * max cpacity, prev_misfit_need of min capacity + * cluster will be 0 and prev_misfit_need of + * max capacity cluster will be 2. + */ +static int compute_prev_cluster_misfit_need(int index) +{ + int cpu; + struct cluster_data *prev_cluster; + int prev_misfit_need = 0; + + /* + * Lowest capacity cluster does not have to + * accommodate any misfit tasks. 
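/*
 * Illustrative sketch (plain C, not part of the patch): the dual-cluster
 * example from the comment above, computed the way compute_cluster_nr_need()
 * does it.  Clusters are assumed to be indexed from lowest to highest
 * capacity and nr[] stands in for the per-CPU averages reported by
 * sched_get_nr_running_avg().
 */
#include <stdio.h>

#define NCLUSTERS 2
#define CPUS_PER_CLUSTER 4

/* 4 small tasks on the little CPUs, 2 big tasks on two big CPUs */
static const int nr[NCLUSTERS][CPUS_PER_CLUSTER] = {
        { 1, 1, 1, 1 },         /* min capacity cluster */
        { 1, 1, 0, 0 },         /* max capacity cluster */
};

/* tasks on this cluster plus everything running on bigger clusters */
static int cluster_nr_need(int index)
{
        int need = 0;

        for (int c = index; c < NCLUSTERS; c++)
                for (int cpu = 0; cpu < CPUS_PER_CLUSTER; cpu++)
                        need += nr[c][cpu];
        return need;
}

int main(void)
{
        printf("little nr_need = %d\n", cluster_nr_need(0));    /* 6 */
        printf("big    nr_need = %d\n", cluster_nr_need(1));    /* 2 */
        return 0;
}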
+ */ + if (index == 0) + return 0; + + prev_cluster = &cluster_state[index - 1]; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + prev_misfit_need += nr_stats[cpu].nr_misfit; + + return prev_misfit_need; +} + +static int compute_cluster_max_nr(int index) +{ + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + int max_nr = 0; + + for_each_cpu(cpu, &cluster->cpu_mask) + max_nr = max(max_nr, nr_stats[cpu].nr_max); + + return max_nr; +} + +static int cluster_real_big_tasks(int index) +{ + int nr_big = 0; + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + + if (index == 0) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr_misfit; + } else { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr; + } + + return nr_big; +} + +/* + * prev_nr_need_assist: + * Tasks that are eligible to run on the previous + * cluster but cannot run because of insufficient + * CPUs there. prev_nr_need_assist is indicative + * of number of CPUs in this cluster that should + * assist its previous cluster to makeup for + * insufficient CPUs there. + * + * For example: + * On tri-cluster system with 4 min capacity + * CPUs, 3 intermediate capacity CPUs and 1 + * max capacity CPU, if there are 4 small + * tasks running on min capacity CPUs, 4 big + * tasks running on intermediate capacity CPUs + * and no tasks running on max capacity CPU, + * prev_nr_need_assist for min & max capacity + * clusters will be 0, but, for intermediate + * capacity cluster prev_nr_need_assist will + * be 1 as it has 3 CPUs, but, there are 4 big + * tasks to be served. + */ +static int prev_cluster_nr_need_assist(int index) +{ + int need = 0; + int cpu; + struct cluster_data *prev_cluster; + + if (index == 0) + return 0; + + index--; + prev_cluster = &cluster_state[index]; + + /* + * Next cluster should not assist, while there are isolated cpus + * in this cluster. + */ + if (prev_cluster->nr_isolated_cpus) + return 0; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + need += nr_stats[cpu].nr; + + need += compute_prev_cluster_misfit_need(index); + + if (need > prev_cluster->active_cpus) + need = need - prev_cluster->active_cpus; + else + need = 0; + + return need; +} + +static void update_running_avg(void) +{ + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + int big_avg = 0; + + sched_get_nr_running_avg(nr_stats); + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + int nr_need, prev_misfit_need; + + if (!cluster->inited) + continue; + + nr_need = compute_cluster_nr_need(index); + prev_misfit_need = compute_prev_cluster_misfit_need(index); + + + cluster->nrrun = nr_need + prev_misfit_need; + cluster->max_nr = compute_cluster_max_nr(index); + cluster->nr_prev_assist = prev_cluster_nr_need_assist(index); + + trace_core_ctl_update_nr_need(cluster->first_cpu, nr_need, + prev_misfit_need, + cluster->nrrun, cluster->max_nr, + cluster->nr_prev_assist); + + big_avg += cluster_real_big_tasks(index); + } + spin_unlock_irqrestore(&state_lock, flags); + + last_nr_big = big_avg; +} + +#define MAX_NR_THRESHOLD 4 +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* unisolate all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* + * unisolate as many cores as the previous cluster + * needs assistance with. 
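/*
 * Illustrative sketch (plain C, not part of the patch): the arithmetic of
 * prev_cluster_nr_need_assist() above -- tasks eligible to run on the
 * previous (smaller) cluster, plus the misfit tasks it inherits, minus the
 * CPUs it actually has active.  The numbers in main() are the tri-cluster
 * example's assumptions, not measurements.
 */
#include <stdio.h>

static int nr_need_assist(int prev_tasks, int prev_misfit_inflow,
                          int prev_active_cpus)
{
        int need = prev_tasks + prev_misfit_inflow;

        return need > prev_active_cpus ? need - prev_active_cpus : 0;
}

int main(void)
{
        /* 4 big tasks on a 3-CPU cluster with no misfit inflow from below */
        printf("CPUs needed to assist: %d\n", nr_need_assist(4, 0, 3));  /* 1 */
        return 0;
}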
+ */ + if (cluster->nr_prev_assist >= cluster->nr_prev_assist_thresh) + new_need = new_need + cluster->nr_prev_assist; + + /* only unisolate more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + new_need = new_need + 1; + + /* + * We don't want tasks to be overcrowded in a cluster. + * If any CPU has more than MAX_NR_THRESHOLD in the last + * window, bring another CPU to help out. + */ + if (cluster->max_nr > MAX_NR_THRESHOLD) + new_need = new_need + 1; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_isolate_count(&cluster->cpu_mask, true); +} + +static bool is_active(const struct cpu_data *state) +{ + return cpu_online(state->cpu) && !cpu_isolated(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + cluster->nr_isolated_cpus)); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int new_need; + s64 now, elapsed; + + if (unlikely(!cluster->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost || !cluster->enable) { + need_cpus = cluster->max_cpus; + } else { + cluster->active_cpus = get_active_cpu_count(cluster); + thres_idx = cluster->active_cpus ? cluster->active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + bool old_is_busy = c->is_busy; + + if (c->busy >= cluster->busy_up_thres[thres_idx] || + sched_cpu_high_irqload(c->cpu)) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + + trace_core_ctl_set_busy(c->cpu, c->busy, old_is_busy, + c->is_busy); + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + now = ktime_to_ms(ktime_get()); + + if (new_need > cluster->active_cpus) { + ret = 1; + } else { + /* + * When there is no change in need and there are no more + * active CPUs than currently needed, just update the + * need time stamp and return. 
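/*
 * Illustrative sketch (plain C, not part of the patch): a condensed model of
 * how runqueue pressure bumps a cluster's CPU count (apply_task_need()) and
 * how the result is clamped to the sysfs limits (apply_limits()).  The
 * struct mirrors only the fields those two functions read; MAX_NR_THRESHOLD
 * is the same constant as above.
 */
#define MAX_NR_THRESHOLD 4

struct cluster_model {
        unsigned int num_cpus, min_cpus, max_cpus;
        unsigned int nrrun;                     /* runnable on this + bigger clusters */
        unsigned int max_nr;                    /* deepest single-CPU queue last window */
        unsigned int task_thres;                /* "just open everything" threshold */
        unsigned int nr_prev_assist;
        unsigned int nr_prev_assist_thresh;
};

static unsigned int clamp_need(const struct cluster_model *c, unsigned int need)
{
        if (need < c->min_cpus)
                need = c->min_cpus;
        if (need > c->max_cpus)
                need = c->max_cpus;
        return need;
}

static unsigned int task_need(const struct cluster_model *c, unsigned int need)
{
        if (c->nrrun >= c->task_thres)
                return clamp_need(c, c->num_cpus);      /* flooded: open everything */
        if (c->nr_prev_assist >= c->nr_prev_assist_thresh)
                need += c->nr_prev_assist;      /* cover for the smaller cluster */
        if (c->nrrun > need)
                need++;                         /* more runnable tasks than busy CPUs */
        if (c->max_nr > MAX_NR_THRESHOLD)
                need++;                         /* one runqueue is badly overcrowded */
        return clamp_need(c, need);
}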
+ */ + if (new_need == last_need && new_need == cluster->active_cpus) { + cluster->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + elapsed = now - cluster->need_ts; + ret = elapsed >= cluster->offline_delay_ms; + } + + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } + trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(struct cluster_data *cluster) +{ + unsigned long flags; + + spin_lock_irqsave(&cluster->pending_lock, flags); + cluster->pending = true; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + wake_up_process(cluster->core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; + +int core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster = NULL; + unsigned long flags; + int ret = 0; + bool boost_state_changed = false; + + if (unlikely(!initialized)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + if (boost) { + boost_state_changed = !cluster->boost; + ++cluster->boost; + } else { + if (!cluster->boost) { + ret = -EINVAL; + break; + } else { + --cluster->boost; + boost_state_changed = !cluster->boost; + } + } + } + spin_unlock_irqrestore(&state_lock, flags); + + if (boost_state_changed) { + index = 0; + for_each_cluster(cluster, index) + apply_need(cluster); + } + + if (cluster) + trace_core_ctl_set_boost(cluster->boost, ret); + + return ret; +} +EXPORT_SYMBOL(core_ctl_set_boost); + +void core_ctl_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&core_ctl_notifier, n); +} + +void core_ctl_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&core_ctl_notifier, n); +} + +static void core_ctl_call_notifier(void) +{ + struct core_ctl_notif_data ndata; + struct notifier_block *nb; + + /* + * Don't bother querying the stats when the notifier + * chain is empty. 
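/*
 * Illustrative sketch (plain C, not part of the patch): the up-fast /
 * down-slow hysteresis used by eval_need() above, stripped to its time
 * arithmetic.  Timestamps are plain millisecond counters here instead of
 * ktime values.
 */
#include <stdbool.h>
#include <stdint.h>

struct need_state {
        unsigned int need_cpus;                 /* last committed need */
        unsigned int active_cpus;
        int64_t need_ts;                        /* when the need last changed or was confirmed */
        unsigned int offline_delay_ms;          /* how long a lower need must persist */
};

static bool commit_need(struct need_state *s, unsigned int new_need, int64_t now_ms)
{
        bool commit;

        if (new_need == s->need_cpus && new_need == s->active_cpus) {
                s->need_ts = now_ms;            /* nothing to do, just re-arm the delay */
                return false;
        }

        if (new_need > s->active_cpus)
                commit = true;                  /* bringing CPUs in is never delayed */
        else
                commit = (now_ms - s->need_ts) >= s->offline_delay_ms;

        if (commit) {
                s->need_ts = now_ms;
                s->need_cpus = new_need;
        }
        return commit;
}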
+ */ + rcu_read_lock(); + nb = rcu_dereference_raw(core_ctl_notifier.head); + rcu_read_unlock(); + + if (!nb) + return; + + ndata.nr_big = last_nr_big; + + atomic_notifier_call_chain(&core_ctl_notifier, 0, &ndata); +} + +void core_ctl_check(u64 window_start) +{ + int cpu; + struct cpu_data *c; + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + + if (unlikely(!initialized)) + return; + + if (window_start == core_ctl_check_timestamp) + return; + + core_ctl_check_timestamp = window_start; + + spin_lock_irqsave(&state_lock, flags); + for_each_possible_cpu(cpu) { + + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + + if (!cluster || !cluster->inited) + continue; + + c->busy = sched_get_cpu_util(cpu); + } + spin_unlock_irqrestore(&state_lock, flags); + + update_running_avg(); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); + } + + core_ctl_call_notifier(); +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + spin_unlock_irqrestore(&state_lock, flags); +} + +static void cpuset_next(struct cluster_data *cluster) { } + +static bool should_we_isolate(int cpu, struct cluster_data *cluster) +{ + return true; +} + +static void try_to_isolate(struct cluster_data *cluster, unsigned int need) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_isolated = 0; + bool first_pass = cluster->nr_not_preferred_cpus; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't isolate busy CPUs. */ + if (c->is_busy) + continue; + + /* + * We isolate only the not_preferred CPUs. If none + * of the CPUs are selected as not_preferred, then + * all CPUs are eligible for isolation. + */ + if (cluster->nr_not_preferred_cpus && !c->not_preferred) + continue; + + if (!should_we_isolate(c->cpu, cluster)) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + +again: + /* + * If the number of active CPUs is within the limits, then + * don't force isolation of any busy CPUs. 
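/*
 * Illustrative sketch (plain C, not part of the patch): the CPU-selection
 * rules of the first try_to_isolate() pass above, modelled on a flat array
 * instead of the lock-protected LRU list.  "need" is the target number of
 * active CPUs; the busy and not_preferred flags are the inputs the real code
 * reads under state_lock.
 */
#include <stdbool.h>

#define NCPUS 4

struct cpu_model {
        bool active;                    /* online and not yet isolated */
        bool busy;                      /* above busy_up_thres or high irqload */
        bool not_preferred;             /* user marked this CPU as first to give up */
};

static int count_active(const struct cpu_model *c)
{
        int n = 0;

        for (int i = 0; i < NCPUS; i++)
                n += c[i].active;
        return n;
}

/* returns how many CPUs were picked for isolation */
static int pick_for_isolation(struct cpu_model *c, int need, bool any_not_preferred)
{
        int isolated = 0;

        for (int i = 0; i < NCPUS && count_active(c) > need; i++) {
                if (!c[i].active || c[i].busy)
                        continue;               /* never isolate busy CPUs in this pass */
                if (any_not_preferred && !c[i].not_preferred)
                        continue;               /* honour the preference list first */
                c[i].active = false;            /* sched_isolate_cpu() in the real code */
                isolated++;
        }
        return isolated;
}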
+ */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + nr_isolated = 0; + num_cpus = cluster->num_cpus; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + if (first_pass && !c->not_preferred) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + + if (first_pass && cluster->active_cpus > cluster->max_cpus) { + first_pass = false; + goto again; + } +} + +static void __try_to_unisolate(struct cluster_data *cluster, + unsigned int need, bool force) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_unisolated = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!c->isolated_by_us) + continue; + if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to unisolate CPU%u\n", c->cpu); + if (!sched_unisolate_cpu(c->cpu)) { + c->isolated_by_us = false; + move_cpu_lru(c); + nr_unisolated++; + } else { + pr_debug("Unable to unisolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus -= nr_unisolated; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_unisolate(struct cluster_data *cluster, unsigned int need) +{ + bool force_use_non_preferred = false; + + __try_to_unisolate(cluster, need, force_use_non_preferred); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_unisolate(cluster, need, force_use_non_preferred); +} + +static void __ref do_core_ctl(struct cluster_data *cluster) +{ + unsigned int need; + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_isolate(cluster, need); + else if (cluster->active_cpus < need) + try_to_unisolate(cluster, need); + } +} + +static int __ref try_core_ctl(void *data) +{ + struct cluster_data *cluster = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&cluster->pending_lock, flags); + if (!cluster->pending) { + spin_unlock_irqrestore(&cluster->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&cluster->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + cluster->pending = false; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + do_core_ctl(cluster); + } + + return 0; +} + +static int isolation_cpuhp_state(unsigned int cpu, bool online) +{ + struct cpu_data *state = &per_cpu(cpu_state, cpu); + 
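/*
 * Illustrative sketch (user-space pthreads, not part of the patch): the
 * wake_up_core_ctl_thread()/try_core_ctl() pairing above, with a mutex and
 * condition variable standing in for pending_lock plus
 * TASK_INTERRUPTIBLE/schedule().  The worker only acts when "pending" was
 * set, so back-to-back wakeups collapse into a single evaluation.
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool pending;
static bool stop;

static void kick_worker(void)                   /* wake_up_core_ctl_thread() */
{
        pthread_mutex_lock(&lock);
        pending = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

static void do_core_ctl_work(void)              /* do_core_ctl(): evaluate and (un)isolate */
{
}

static void *worker(void *arg)                  /* try_core_ctl() */
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!stop) {
                while (!pending && !stop)
                        pthread_cond_wait(&cond, &lock);        /* schedule() */
                if (stop)
                        break;
                pending = false;
                pthread_mutex_unlock(&lock);
                do_core_ctl_work();             /* run the real work without the lock */
                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        kick_worker();                          /* one evaluation request */

        pthread_mutex_lock(&lock);
        stop = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        pthread_join(tid, NULL);
        return 0;
}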
struct cluster_data *cluster = state->cluster; + unsigned int need; + bool do_wakeup = false, unisolated = false; + unsigned long flags; + + if (unlikely(!cluster || !cluster->inited)) + return 0; + + if (online) { + cluster->active_cpus = get_active_cpu_count(cluster); + + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + move_cpu_lru(state); + } else { + /* + * We don't want to have a CPU both offline and isolated. + * So unisolate a CPU that went down if it was isolated by us. + */ + if (state->isolated_by_us) { + sched_unisolate_cpu_unlocked(cpu); + state->isolated_by_us = false; + unisolated = true; + } + + /* Move a CPU to the end of the LRU when it goes offline. */ + move_cpu_lru(state); + + state->busy = 0; + cluster->active_cpus = get_active_cpu_count(cluster); + } + + need = apply_limits(cluster, cluster->need_cpus); + spin_lock_irqsave(&state_lock, flags); + if (unisolated) + cluster->nr_isolated_cpus--; + do_wakeup = adjustment_possible(cluster, need); + spin_unlock_irqrestore(&state_lock, flags); + if (do_wakeup) + wake_up_core_ctl_thread(cluster); + + return 0; +} + +static int core_ctl_isolation_online_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, true); +} + +static int core_ctl_isolation_dead_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, false); +} + +/* ============================ init code ============================== */ + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. 
Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nr_prev_assist_thresh = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + cluster->enable = true; + cluster->nr_not_preferred_cpus = 0; + INIT_LIST_HEAD(&cluster->lru); + spin_lock_init(&cluster->pending_lock); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster, + "core_ctl/%d", first_cpu); + if (IS_ERR(cluster->core_ctl_thread)) + return PTR_ERR(cluster->core_ctl_thread); + + sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO, + ¶m); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +static int __init core_ctl_init(void) +{ + struct sched_cluster *cluster; + int ret; + + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "core_ctl/isolation:online", + core_ctl_isolation_online_cpu, NULL); + + cpuhp_setup_state_nocalls(CPUHP_CORE_CTL_ISOLATION_DEAD, + "core_ctl/isolation:dead", + NULL, core_ctl_isolation_dead_cpu); + + for_each_sched_cluster(cluster) { + ret = cluster_init(&cluster->cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + } + + initialized = true; + return 0; +} + +late_initcall(core_ctl_init); diff --git a/kernel/sched/core_ctl.h b/kernel/sched/core_ctl.h new file mode 100644 index 0000000000000000000000000000000000000000..98d7cb3e899bec93b6b62c56547bca103ee100f1 --- /dev/null +++ b/kernel/sched/core_ctl.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#ifdef CONFIG_SCHED_CORE_CTL +void core_ctl_check(u64 wallclock); +int core_ctl_set_boost(bool boost); +#else +static inline void core_ctl_check(u64 wallclock) {} +static inline int core_ctl_set_boost(bool boost) +{ + return 0; +} +#endif +#endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5e39da0ae0868756819d096717422e67ccda7998..5bc0dca90f7b95f83cae8e87a6046794123190e8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -287,6 +287,10 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); +#ifdef CONFIG_SCHED_WALT + return cpu_util_freq_walt(sg_cpu->cpu); +#endif + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); } @@ -520,7 +524,8 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) ignore_dl_rate_limit(sg_cpu, sg_policy); - if (sugov_should_update_freq(sg_policy, time)) { + if (sugov_should_update_freq(sg_policy, time) && + !(flags & SCHED_CPUFREQ_CONTINUE)) { next_f = sugov_next_freq_shared(sg_cpu, time); if (sg_policy->policy->fast_switch_enabled) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5a55d230045240246e34f5a9ec099e62e1fde0fc..cf87d3fff5dd1e27ad7fd2a2233240afed2434b4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,7 @@ * Simple CPU accounting cgroup controller */ #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -52,11 +53,18 @@ void irqtime_account_irq(struct task_struct *curr) struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; @@ -70,6 +78,13 @@ void irqtime_account_irq(struct task_struct *curr) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); +#ifdef CONFIG_SCHED_WALT + else + account = false; + + if (account) + sched_account_irqtime(cpu, curr, delta, wallclock); +#endif } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8255267ce32369382041ce74a6db5ab344d6f360..2a64cced37a59af5d54856fdac734804d0d35562 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,7 @@ */ #include "sched.h" #include "pelt.h" +#include "walt.h" struct dl_bandwidth def_dl_bandwidth; @@ -1440,6 +1441,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -1454,6 +1456,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -2547,6 +2550,9 @@ const struct sched_class dl_sched_class .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef 
CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; int sched_dl_global_validate(void) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 70a57827243633b8cd1ebce1318844714cdac15a..e5af311230be068e252239518a83d6930c2ed842 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -715,6 +715,17 @@ do { \ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); PN(clock_task); +#ifdef CONFIG_SCHED_WALT + P(cluster->load_scale_factor); + P(cluster->capacity); + P(cluster->max_possible_capacity); + P(cluster->efficiency); + P(cluster->cur_freq); + P(cluster->max_freq); + P(cluster->exec_scale_factor); + SEQ_printf(m, " .%-30s: %llu\n", "walt_stats.cumulative_runnable_avg", + rq->walt_stats.cumulative_runnable_avg_scaled); +#endif #undef P #undef PN @@ -791,6 +802,12 @@ static void sched_debug_header(struct seq_file *m) PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); +#ifdef CONFIG_SCHED_WALT + P(sched_init_task_load_windows); + P(min_capacity); + P(max_capacity); + P(sched_ravg_window); +#endif #undef PN #undef P @@ -983,6 +1000,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(se.statistics.nr_wakeups_passive); P_SCHEDSTAT(se.statistics.nr_wakeups_idle); +#ifdef CONFIG_SCHED_WALT + P(ravg.demand); +#endif + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c004e3b89c32467526ad91574571c230e78cc7a9..9f70422acc7fa2697e9620eb6f4793f9b9c65f8c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,34 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "walt.h" + +#ifdef CONFIG_SCHED_WALT +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +#endif + +#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_CFS_BANDWIDTH) +static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq); +static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, + struct task_struct *p); +static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, + struct task_struct *p); +static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +#else +static inline void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) {} +static inline void +walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} +static inline void +walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +#define walt_inc_throttled_cfs_rq_stats(...) +#define walt_dec_throttled_cfs_rq_stats(...) 
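/*
 * Illustrative sketch (plain C, not part of the patch): the compile-out
 * idiom used for the WALT helpers just above.  CONFIG_MY_FEATURE,
 * my_feature_account(), my_feature_fixup() and struct account_ctx are made-up
 * names; the point is that call sites stay unconditional because the
 * disabled variants collapse to empty inlines or no-op variadic macros.
 */
struct account_ctx;                             /* stand-in for the real per-CPU state */

#ifdef CONFIG_MY_FEATURE                        /* hypothetical config switch */
void my_feature_account(struct account_ctx *ctx, int delta);
#else
static inline void my_feature_account(struct account_ctx *ctx, int delta)
{
        (void)ctx;
        (void)delta;                            /* compiled out entirely */
}
#define my_feature_fixup(...)   do { } while (0)
#endif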
+ +#endif /* * Targeted preemption latency for CPU-bound tasks: @@ -1559,7 +1587,6 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); -static unsigned long cpu_util(int cpu); static inline long adjust_numa_imbalance(int imbalance, int nr_running); static inline enum @@ -3902,6 +3929,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return p->ravg.demand_scaled; +#endif return READ_ONCE(p->se.avg.util_avg); } @@ -3914,6 +3945,10 @@ static inline unsigned long _task_util_est(struct task_struct *p) static inline unsigned long task_util_est(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return p->ravg.demand_scaled; +#endif return max(task_util(p), _task_util_est(p)); } @@ -4826,13 +4861,16 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; + walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq); if (qcfs_rq->load.weight) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); + walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq); + } /* * Note: distribution will already see us throttled via the @@ -4849,6 +4887,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; + struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4877,6 +4916,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -4891,7 +4931,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; - + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -4907,6 +4947,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); unthrottle_throttle: /* @@ -5470,8 +5511,6 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP -static inline unsigned long cpu_util(int cpu); - static inline bool cpu_overutilized(int cpu) { return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); @@ -5539,6 +5578,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + walt_inc_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5556,6 +5596,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + walt_inc_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5571,7 +5612,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); - + 
inc_rq_walt_stats(rq, p); /* * Since new tasks are assigned an initial util_avg equal to * half of the spare capacity of their CPU, tiny tasks have the @@ -5633,6 +5674,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + walt_dec_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5662,6 +5704,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + walt_dec_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5671,6 +5714,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); + dec_rq_walt_stats(rq, p); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -6193,6 +6237,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return -1; + if (cpu_isolated(cpu)) + continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; } @@ -6264,15 +6310,15 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_capacity(task_util, target)) + !cpu_isolated(target) && asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_capacity(task_util, prev)) + ((available_idle_cpu(prev) || sched_idle_cpu(prev)) && + !cpu_isolated(target) && asym_fits_capacity(task_util, prev))) return prev; /* @@ -6382,11 +6428,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * * Return: the (estimated) utilization for the specified CPU */ -static inline unsigned long cpu_util(int cpu) +unsigned long cpu_util(int cpu) { struct cfs_rq *cfs_rq; unsigned int util; +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) { + u64 walt_cpu_util = + cpu_rq(cpu)->walt_stats.cumulative_runnable_avg_scaled; + + return min_t(unsigned long, walt_cpu_util, + capacity_orig_of(cpu)); + } +#endif + cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); @@ -6414,10 +6470,29 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) struct cfs_rq *cfs_rq; unsigned int util; +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. 
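/*
 * Illustrative sketch (plain C, not part of the patch): the WALT branches of
 * cpu_util() and cpu_util_without() above, reduced to their arithmetic.
 * "demand" stands in for walt_stats.cumulative_runnable_avg_scaled, "cap"
 * for capacity_orig_of(cpu), and the waking flag mirrors the
 * p->state == TASK_WAKING special case.
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t walt_util(uint64_t demand, uint64_t cap)
{
        return demand < cap ? demand : cap;     /* never report more than the CPU offers */
}

static uint64_t walt_util_without(uint64_t demand, uint64_t cap,
                                  uint64_t task_demand, bool task_is_waking)
{
        uint64_t util = walt_util(demand, cap);

        /*
         * A waking task is not runnable yet, so its demand is not part of
         * the window sum; there is nothing meaningful to subtract.
         */
        if (task_is_waking)
                return util;

        util = util > task_demand ? util - task_demand : 0;
        return util < cap ? util : cap;
}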
+ */ + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util) && + p->state == TASK_WAKING) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return cpu_util(cpu); +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) { + util = max_t(long, cpu_util(cpu) - task_util(p), 0); + return min_t(unsigned long, util, capacity_orig_of(cpu)); + } +#endif + cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); @@ -6523,6 +6598,18 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) return min(util, capacity_orig_of(cpu)); } +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig; + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + + return cap_scale(max_cap, scale_freq); +} + /* * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization @@ -8205,6 +8292,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu) for_each_cpu(cpu, sched_group_span(sdg)) { unsigned long cpu_cap = capacity_of(cpu); + if (cpu_isolated(cpu)) + continue; + capacity += cpu_cap; min_capacity = min(cpu_cap, min_capacity); max_capacity = max(cpu_cap, max_capacity); @@ -8218,10 +8308,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { struct sched_group_capacity *sgc = group->sgc; - - capacity += sgc->capacity; - min_capacity = min(sgc->min_capacity, min_capacity); - max_capacity = max(sgc->max_capacity, max_capacity); + __maybe_unused cpumask_t *cpus = + sched_group_span(group); + + if (!cpu_isolated(cpumask_first(cpus))) { + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, + min_capacity); + max_capacity = max(sgc->max_capacity, + max_capacity); + } group = group->next; } while (group != child->groups); } @@ -8429,6 +8525,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); + if (cpu_isolated(i)) + continue; + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; @@ -8470,11 +8569,20 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + sgs->group_type = group_has_spare; + sgs->group_weight = group->group_weight; + return; + } + /* Check if dst CPU is idle and preferred to this group */ if (env->sd->flags & SD_ASYM_PACKING && - env->idle != CPU_NOT_IDLE && - sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + env->idle != CPU_NOT_IDLE && + sgs->sum_h_nr_running && + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { sgs->group_asym_packing = 1; } @@ -9532,6 +9640,17 @@ static int need_active_balance(struct lb_env *env) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } +#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_WALT) +int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + cpumask_t cpus; + + cpumask_and(&cpus, sched_group_span(sg), group_balance_mask(sg)); + cpumask_andnot(&cpus, &cpus, cpu_isolated_mask); + return cpumask_first(&cpus); +} +#endif /* CONFIG_SMP && CONFIG_SCHED_WALT */ + static int 
active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) @@ -9555,7 +9674,7 @@ static int should_we_balance(struct lb_env *env) /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { - if (!idle_cpu(cpu)) + if (!idle_cpu(cpu) || cpu_isolated(cpu)) continue; /* Are we the first idle CPU? */ @@ -9563,7 +9682,7 @@ static int should_we_balance(struct lb_env *env) } /* Are we the first CPU of this group ? */ - return group_balance_cpu(sg) == env->dst_cpu; + return group_balance_cpu_not_isolated(sg) == env->dst_cpu; } /* @@ -9765,7 +9884,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, * ->active_balance_work. Once set, it's cleared * only after active load balance is finished. */ - if (!busiest->active_balance) { + if (!busiest->active_balance && + !cpu_isolated(cpu_of(busiest))) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; @@ -9979,7 +10099,17 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + unsigned int available_cpus; +#ifdef CONFIG_SCHED_WALT + cpumask_t avail_mask; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); +#else + available_cpus = num_online_cpus(); +#endif + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -10109,6 +10239,10 @@ static inline int find_new_ilb(void) for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { +#ifdef CONFIG_SCHED_WALT + if (cpu_isolated(ilb)) + continue; +#endif if (idle_cpu(ilb)) return ilb; } @@ -10163,6 +10297,7 @@ static void nohz_balancer_kick(struct rq *rq) struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; unsigned int flags = 0; + cpumask_t cpumask; if (unlikely(rq->idle_balance)) return; @@ -10177,8 +10312,15 @@ static void nohz_balancer_kick(struct rq *rq) * None are in tickless mode and hence no need for NOHZ idle load * balancing. */ +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&cpumask, nohz.idle_cpus_mask, cpu_isolated_mask); + if (cpumask_empty(&cpumask)) + return; +#else + cpumask_copy(&cpumask, nohz.idle_cpus_mask); if (likely(!atomic_read(&nohz.nr_cpus))) return; +#endif if (READ_ONCE(nohz.has_blocked) && time_after(now, READ_ONCE(nohz.next_blocked))) @@ -10214,7 +10356,7 @@ static void nohz_balancer_kick(struct rq *rq) * currently idle; in which case, kick the ILB to move tasks * around. 
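/*
 * Illustrative sketch (plain C, not part of the patch): the cpumask
 * filtering the isolation-aware balance paths above keep repeating, written
 * with flat bitmasks (bit N = CPU N) instead of struct cpumask.  The helper
 * names are made up; the same and-with-online / and-not-with-isolated shape
 * appears in group_balance_cpu_not_isolated(), update_max_interval() and the
 * nohz kick path.
 */
#include <stdint.h>

/* first CPU allowed to balance for a group: in the balance mask, not isolated */
static int first_balance_cpu(uint64_t group_mask, uint64_t balance_mask,
                             uint64_t isolated_mask)
{
        uint64_t eligible = (group_mask & balance_mask) & ~isolated_mask;

        if (!eligible)
                return -1;                      /* the whole group is isolated */
        return __builtin_ctzll(eligible);       /* index of the lowest set bit */
}

/* CPUs that may actually take part in load balancing */
static unsigned int available_cpus(uint64_t online_mask, uint64_t isolated_mask)
{
        return (unsigned int)__builtin_popcountll(online_mask & ~isolated_mask);
}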
*/ - for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { + for_each_cpu_and(i, sched_domain_span(sd), &cpumask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; goto unlock; @@ -10392,6 +10534,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, int balance_cpu; int ret = false; struct rq *rq; + cpumask_t cpus; SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); @@ -10411,7 +10554,13 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, */ smp_mb(); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask); +#else + cpumask_copy(&cpus, nohz.idle_cpus_mask); +#endif + + for_each_cpu(balance_cpu, &cpus) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -10562,6 +10711,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we @@ -10675,6 +10827,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; + /* + * Since core isolation doesn't update nohz.idle_cpus_mask, there + * is a possibility this nohz kicked cpu could be isolated. Hence + * return if the cpu is isolated. + */ + if (cpu_isolated(this_rq->cpu)) + return; + /* * If this CPU has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle CPUs whose ticks are @@ -10696,8 +10856,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. 
+ */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) @@ -11269,6 +11431,9 @@ const struct sched_class fair_sched_class #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = walt_fixup_sched_stats_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG @@ -11321,6 +11486,96 @@ __init void init_sched_fair_class(void) } +/* WALT sched implementation begins here */ +#ifdef CONFIG_SCHED_WALT + +#ifdef CONFIG_CFS_BANDWIDTH + +static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) +{ + cfs_rq->walt_stats.cumulative_runnable_avg_scaled = 0; +} + +static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) +{ + inc_nr_big_task(&cfs_rq->walt_stats, p); + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + p->ravg.demand_scaled); +} + +static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) +{ + dec_nr_big_task(&cfs_rq->walt_stats, p); + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + -(s64)p->ravg.demand_scaled); +} + +static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *tcfs_rq) +{ + struct rq *rq = rq_of(tcfs_rq); + + fixup_cumulative_runnable_avg(stats, + tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + + if (stats == &rq->walt_stats) + walt_fixup_cum_window_demand(rq, + tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + +} + +static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *tcfs_rq) +{ + struct rq *rq = rq_of(tcfs_rq); + + fixup_cumulative_runnable_avg(stats, + -tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + + /* + * We remove the throttled cfs_rq's tasks's contribution from the + * cumulative window demand so that the same can be added + * unconditionally when the cfs_rq is unthrottled. + */ + if (stats == &rq->walt_stats) + walt_fixup_cum_window_demand(rq, + -tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); +} + +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + s64 task_load_delta = (s64)updated_demand_scaled - + p->ravg.demand_scaled; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + task_load_delta); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Fix up rq->walt_stats only if we didn't find any throttled cfs_rq */ + if (!se) { + fixup_cumulative_runnable_avg(&rq->walt_stats, + task_load_delta); + walt_fixup_cum_window_demand(rq, task_load_delta); + } +} + +#else /* CONFIG_CFS_BANDWIDTH */ +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + fixup_walt_sched_stats_common(rq, p, updated_demand_scaled); +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_SCHED_WALT */ + /* * Helper functions to facilitate extracting info from tracepoints. 
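/*
 * Illustrative sketch (plain C, not part of the patch): the delta
 * propagation done by walt_fixup_sched_stats_fair() above, modelled as a
 * walk up a parent chain of group runqueues.  Each level absorbs the change
 * in the task's scaled demand; the walk stops at a throttled group because
 * its contribution was already removed from the root when it was throttled.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct group_rq {
        struct group_rq *parent;                /* NULL at the top of the hierarchy */
        bool throttled;
        int64_t runnable_demand;                /* cumulative_runnable_avg_scaled */
};

static void fixup_demand(struct group_rq *leaf, int64_t *root_demand,
                         int64_t old_demand, int64_t new_demand)
{
        int64_t delta = new_demand - old_demand;
        struct group_rq *grq;

        for (grq = leaf; grq; grq = grq->parent) {
                grq->runnable_demand += delta;
                if (grq->throttled)
                        return;                 /* the root already excludes this subtree */
        }

        /* reached the top without hitting a throttled group: fix the root too */
        *root_demand += delta;
}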
*/ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dae1e8eaa98329177da6affd4717042bd0826d54..6c1475950441419660a5d322250bedf868b2036f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -6,6 +6,7 @@ #include "sched.h" #include "pelt.h" +#include "walt.h" int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -264,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -1389,6 +1394,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1400,6 +1406,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } @@ -2276,7 +2283,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; rt_queue_pull_task(rq); @@ -2480,6 +2488,9 @@ const struct sched_class rt_sched_class #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 08db8e095e48f5bf507b8977697d4828ba73e934..503a7d147ac5a34abb05358f6e9336e80eba1d2f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -87,6 +87,48 @@ struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sched_ravg_window; +extern unsigned int walt_cpu_util_freq_divisor; + +struct walt_sched_stats { + u64 cumulative_runnable_avg_scaled; +}; + +struct load_subtractions { + u64 window_start; + u64 subs; + u64 new_subs; +}; + +#define NUM_TRACKED_WINDOWS 2 + +struct sched_cluster { + raw_spinlock_t load_lock; + struct list_head list; + struct cpumask cpus; + int id; + int max_power_cost; + int min_power_cost; + int max_possible_capacity; + int capacity; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + unsigned int exec_scale_factor; + /* + * max_freq = user maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq; + unsigned int max_possible_freq; + bool freq_init_done; +}; + +extern unsigned int sched_disable_window_stats; + +#endif /* CONFIG_SCHED_WALT */ + + /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -99,6 +141,10 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); +#ifdef CONFIG_SMP +extern void init_sched_groups_capacity(int cpu, struct sched_domain *sd); +#endif + extern void call_trace_sched_update_nr_running(struct rq *rq, int count); /* * Helpers for 
converting nanosecond timing to jiffy resolution @@ -594,6 +640,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + struct walt_sched_stats walt_stats; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; @@ -604,6 +654,9 @@ struct cfs_rq { int throttled; int throttle_count; struct list_head throttled_list; +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1008,6 +1061,25 @@ struct rq { u64 max_idle_balance_cost; #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_WALT + struct sched_cluster *cluster; + struct cpumask freq_domain_cpumask; + struct walt_sched_stats walt_stats; + + u64 window_start; + unsigned long walt_flags; + + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 cum_window_demand_scaled; + struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -1837,6 +1909,10 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif +#ifdef CONFIG_SCHED_WALT + void (*fixup_walt_sched_stats)(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +#endif } __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -2052,6 +2128,15 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ +#ifdef CONFIG_SCHED_WALT +u64 sched_ktime_clock(void); +#else +static inline u64 sched_ktime_clock(void) +{ + return sched_clock(); +} +#endif + #ifndef arch_scale_freq_tick static __always_inline void arch_scale_freq_tick(void) @@ -2077,7 +2162,14 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +unsigned long capacity_curr_of(int cpu); +unsigned long cpu_util(int cpu); + #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_disabled; +#endif #ifdef CONFIG_PREEMPTION static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -2390,11 +2482,20 @@ DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; + u64 clock; + +#ifdef CONFIG_SCHED_WALT + if (!(flags & SCHED_CPUFREQ_WALT)) + return; + clock = sched_ktime_clock(); +#else + clock = rq_clock(rq); +#endif data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, cpu_of(rq))); if (data) - data->func(data, rq_clock(rq), flags); + data->func(data, clock, flags); } #else static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} @@ -2644,3 +2745,282 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#ifdef CONFIG_SCHED_WALT +static inline int cluster_first_cpu(struct sched_cluster *cluster) +{ + return cpumask_first(&cluster->cpus); +} + +extern struct list_head cluster_head; +extern struct sched_cluster *sched_cluster[NR_CPUS]; + +#define for_each_sched_cluster(cluster) \ + list_for_each_entry_rcu(cluster, &cluster_head, list) + +extern struct mutex policy_mutex; +extern unsigned int 
sched_disable_window_stats; +extern unsigned int max_possible_freq; +extern unsigned int min_max_freq; +extern unsigned int max_possible_efficiency; +extern unsigned int min_possible_efficiency; +extern unsigned int max_capacity; +extern unsigned int min_capacity; +extern unsigned int max_load_scale_factor; +extern unsigned int max_possible_capacity; +extern unsigned int min_max_possible_capacity; +extern unsigned int max_power_cost; +extern unsigned int __read_mostly sched_init_task_load_windows; +extern unsigned int sysctl_sched_restrict_cluster_spill; +extern unsigned int sched_pred_alert_load; +extern struct sched_cluster init_cluster; + +static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta) +{ + rq->cum_window_demand_scaled += scaled_delta; + if (unlikely((s64)rq->cum_window_demand_scaled < 0)) + rq->cum_window_demand_scaled = 0; +} + +/* Is frequency of two cpus synchronized with each other? */ +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + struct rq *rq = cpu_rq(src_cpu); + + if (src_cpu == dst_cpu) + return 1; + + return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); +} + +extern void reset_task_stats(struct task_struct *p); + +#define CPU_RESERVED 1 +static inline int is_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline int mark_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_and_set_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline void clear_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline int cpu_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->capacity; +} + +static inline int cpu_max_possible_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_capacity; +} + +static inline int cpu_load_scale_factor(int cpu) +{ + return cpu_rq(cpu)->cluster->load_scale_factor; +} + +static inline unsigned int cluster_max_freq(struct sched_cluster *cluster) +{ + /* + * Governor and thermal driver don't know the other party's mitigation + * voting. So struct cluster saves both and return min() for current + * cluster fmax. + */ + return cluster->max_freq; +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static inline void __update_min_max_capacity(void) +{ + int i; + int max_cap = 0, min_cap = INT_MAX; + + for_each_possible_cpu(i) { + if (!cpu_active(i)) + continue; + + max_cap = max(max_cap, cpu_capacity(i)); + min_cap = min(min_cap, cpu_capacity(i)); + } + + max_capacity = max_cap; + min_capacity = min_cap; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static inline unsigned long +load_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cluster->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, + cluster_max_freq(cluster)); +} + +static inline int compute_load_scale_factor(struct sched_cluster *cluster) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. 
Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. + */ + load_scale *= load_scale_cpu_efficiency(cluster); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cluster); + load_scale >>= 10; + + return load_scale; +} + +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} + +static inline bool is_max_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == max_possible_capacity; +} + +static inline bool is_min_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == min_max_possible_capacity; +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long +capacity_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return (1024 * cluster->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) +{ + return (1024 * cluster_max_freq(cluster)) / min_max_freq; +} + +static inline int compute_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cluster); + capacity >>= 10; + + return capacity; +} + +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return cpu_max_possible_capacity(cpu); +} + +static inline unsigned long cpu_util_freq_walt(int cpu) +{ + u64 util; + struct rq *rq = cpu_rq(cpu); + unsigned long capacity = capacity_orig_of(cpu); + + if (unlikely(walt_disabled || !sysctl_sched_use_walt_cpu_util)) + return cpu_util(cpu); + + util = rq->prev_runnable_sum << SCHED_CAPACITY_SHIFT; + util = div_u64(util, sched_ravg_window); + + return (util >= capacity) ? 
capacity : util; +} +#else /* CONFIG_SCHED_WALT */ + +static inline bool hmp_capable(void) { return false; } + +static inline void walt_fixup_cum_window_demand(struct rq *rq, + s64 scaled_delta) { } + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} + +static inline int is_reserved(int cpu) +{ + return 0; +} + +static inline void clear_reserved(int cpu) { } + +#endif /* CONFIG_SCHED_WALT */ + +struct sched_avg_stats { + int nr; + int nr_misfit; + int nr_max; + int nr_scaled; +}; +extern void sched_get_nr_running_avg(struct sched_avg_stats *stats); + +#ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +extern int group_balance_cpu_not_isolated(struct sched_group *sg); +#else +static inline int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + return group_balance_cpu(sg); +} +#endif /* CONFIG_SCHED_WALT */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_HOTPLUG_CPU +extern void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, + bool migrate_pinned_tasks); +#ifdef CONFIG_SCHED_WALT +extern void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks); +extern void attach_tasks_core(struct list_head *tasks, struct rq *rq); +#else +static inline void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks) +{ +} +static inline void attach_tasks_core(struct list_head *tasks, struct rq *rq) {} +#endif +#endif diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c new file mode 100644 index 0000000000000000000000000000000000000000..be968752bd8b72cb14f40a4fbd81874311887c41 --- /dev/null +++ b/kernel/sched/sched_avg.c @@ -0,0 +1,194 @@ +/* Copyright (c) 2012, 2015-2018, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* + * Scheduler hook for average runqueue determination + */ +#include +#include +#include +#include +#include + +#include "sched.h" +#include "walt.h" +#include + +static DEFINE_PER_CPU(u64, nr_prod_sum); +static DEFINE_PER_CPU(u64, last_time); +static DEFINE_PER_CPU(u64, nr_big_prod_sum); +static DEFINE_PER_CPU(u64, nr); +static DEFINE_PER_CPU(u64, nr_max); + +static DEFINE_PER_CPU(unsigned long, iowait_prod_sum); +static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); +static s64 last_get_time; + +static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0); + +#define NR_THRESHOLD_PCT 15 + +/** + * sched_get_nr_running_avg + * @return: Average nr_running, iowait and nr_big_tasks value since last poll. + * Returns the avg * 100 to return up to two decimal points + * of accuracy. + * + * Obtains the average nr_running value since the last poll. 
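+ * The per-cpu products are accumulated in sched_update_nr_prod() and are
+ * normalized here over the time elapsed since the previous call; the results
+ * are returned through the @stats array rather than as a return value.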
+ * This function may not be called concurrently with itself + */ +void sched_get_nr_running_avg(struct sched_avg_stats *stats) +{ + int cpu; + u64 curr_time = sched_clock(); + u64 period = curr_time - last_get_time; + u64 tmp_nr, tmp_misfit; + + if (!period) + return; + + /* read and reset nr_running counts */ + for_each_possible_cpu(cpu) { + unsigned long flags; + u64 diff; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + + tmp_nr = per_cpu(nr_prod_sum, cpu); + tmp_nr += per_cpu(nr, cpu) * diff; + tmp_nr = div64_u64((tmp_nr * 100), period); + + tmp_misfit = per_cpu(nr_big_prod_sum, cpu); + tmp_misfit = div64_u64((tmp_misfit * 100), period); + + /* + * NR_THRESHOLD_PCT is to make sure that the task ran + * at least 85% in the last window to compensate any + * over estimating being done. + */ + stats[cpu].nr = (int)div64_u64((tmp_nr + NR_THRESHOLD_PCT), + 100); + stats[cpu].nr_misfit = (int)div64_u64((tmp_misfit + + NR_THRESHOLD_PCT), 100); + stats[cpu].nr_max = per_cpu(nr_max, cpu); + + trace_sched_get_nr_running_avg(cpu, stats[cpu].nr, + stats[cpu].nr_misfit, stats[cpu].nr_max); + + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr_prod_sum, cpu) = 0; + per_cpu(nr_big_prod_sum, cpu) = 0; + per_cpu(iowait_prod_sum, cpu) = 0; + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); + } + + last_get_time = curr_time; + +} +EXPORT_SYMBOL(sched_get_nr_running_avg); + +#define BUSY_NR_RUN 3 +#define BUSY_LOAD_FACTOR 10 +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ + bool nr_run_trigger = false, load_trigger = false; + + if (!hmp_capable() || is_min_capacity_cpu(cpu)) + return; + + if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN) + nr_run_trigger = true; + + if (dequeue && (cpu_util(cpu) * BUSY_LOAD_FACTOR) > + capacity_orig_of(cpu)) + load_trigger = true; + + if (nr_run_trigger || load_trigger) + atomic64_set(&per_cpu(last_busy_time, cpu), curr_time); +} + +/** + * sched_update_nr_prod + * @cpu: The core id of the nr running driver. + * @delta: Adjust nr by 'delta' amount + * @inc: Whether we are increasing or decreasing the count + * @return: N/A + * + * Update average with latest nr_running value for CPU + */ +void sched_update_nr_prod(int cpu, long delta, bool inc) +{ + u64 diff; + u64 curr_time; + unsigned long flags, nr_running; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + nr_running = per_cpu(nr, cpu); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta); + + BUG_ON((s64)per_cpu(nr, cpu) < 0); + + if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + update_last_busy_time(cpu, !inc, nr_running, curr_time); + + per_cpu(nr_prod_sum, cpu) += nr_running * diff; + per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); +} +EXPORT_SYMBOL(sched_update_nr_prod); + +/* + * Returns the CPU utilization % in the last window. 
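+ * With WALT enabled (walt_disabled == 0 and sysctl_sched_use_walt_cpu_util
+ * set), the window busy time in rq->prev_runnable_sum is used; otherwise the
+ * PELT util_avg of the CFS runqueue is used. The result is clamped to 100.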
+ * + */ +unsigned int sched_get_cpu_util(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 util; + unsigned long capacity, flags; + unsigned int busy; + + raw_spin_lock_irqsave(&rq->lock, flags); + + util = rq->cfs.avg.util_avg; + capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = rq->prev_runnable_sum; + util = div64_u64(util, + sched_ravg_window >> SCHED_CAPACITY_SHIFT); + } +#endif + raw_spin_unlock_irqrestore(&rq->lock, flags); + + util = (util >= capacity) ? capacity : util; + busy = div64_ul((util * 100), capacity); + return busy; +} + +u64 sched_get_cpu_last_busy_time(int cpu) +{ + return atomic64_read(&per_cpu(last_busy_time, cpu)); +} diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b125618eeda2a117efc1f805c62be28c13..ae43901c57afcc24967084998c72c2760ad74f5a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "walt.h" #ifdef CONFIG_SMP static int @@ -47,12 +48,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) @@ -133,4 +136,7 @@ const struct sched_class stop_sched_class .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 004e9505f7ad6e2f7eb8d7e688831c57772784a6..24f9f092a574c63b92650f8c5b298e88a56e1ff0 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1220,16 +1220,25 @@ build_sched_groups(struct sched_domain *sd, int cpu) * group having more cpu_capacity will pickup more load compared to the * group having less cpu_capacity. */ -static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) +void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; +#ifdef CONFIG_SCHED_WALT + cpumask_t avail_mask; +#endif WARN_ON(!sg); do { int cpu, max_cpu = -1; +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&avail_mask, sched_group_span(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); +#else sg->group_weight = cpumask_weight(sched_group_span(sg)); +#endif if (!(sd->flags & SD_ASYM_PACKING)) goto next; diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 0000000000000000000000000000000000000000..7ad74b58335836d6ec3b502b71d4523b21111a9e --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1740 @@ +/* + * walt.c + * + * Window Assistant Load Tracking + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "sched.h"
+#include "walt.h"
+#define CREATE_TRACE_POINTS
+#include
+#undef CREATE_TRACE_POINTS
+
+const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
+				   "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
+				   "IRQ_UPDATE"};
+
+#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
+#define SCHED_ACCOUNT_WAIT_TIME 1
+
+static ktime_t ktime_last;
+static bool sched_ktime_suspended;
+DEFINE_MUTEX(cluster_lock);
+static atomic64_t walt_irq_work_lastq_ws;
+u64 walt_load_reported_window;
+
+static struct irq_work walt_cpufreq_irq_work;
+static struct irq_work walt_migration_irq_work;
+
+u64 sched_ktime_clock(void)
+{
+	if (unlikely(sched_ktime_suspended))
+		return ktime_to_ns(ktime_last);
+	return ktime_get_ns();
+}
+
+static void sched_resume(void)
+{
+	sched_ktime_suspended = false;
+}
+
+static int sched_suspend(void)
+{
+	ktime_last = ktime_get();
+	sched_ktime_suspended = true;
+	return 0;
+}
+
+static struct syscore_ops sched_syscore_ops = {
+	.resume = sched_resume,
+	.suspend = sched_suspend
+};
+
+static int __init sched_init_ops(void)
+{
+	register_syscore_ops(&sched_syscore_ops);
+	return 0;
+}
+late_initcall(sched_init_ops);
+
+static void acquire_rq_locks_irqsave(const cpumask_t *cpus,
+				     unsigned long *flags)
+{
+	int cpu;
+	int level = 0;
+
+	local_irq_save(*flags);
+	for_each_cpu(cpu, cpus) {
+		if (level == 0)
+			raw_spin_lock(&cpu_rq(cpu)->lock);
+		else
+			raw_spin_lock_nested(&cpu_rq(cpu)->lock, level);
+		level++;
+	}
+}
+
+static void release_rq_locks_irqrestore(const cpumask_t *cpus,
+					unsigned long *flags)
+{
+	int cpu;
+
+	for_each_cpu(cpu, cpus)
+		raw_spin_unlock(&cpu_rq(cpu)->lock);
+	local_irq_restore(*flags);
+}
+
+#ifdef CONFIG_HZ_300
+/*
+ * The tick interval becomes 3333333 ns due to
+ * rounding error when HZ=300.
+ */
+#define MIN_SCHED_RAVG_WINDOW (3333333 * 6)
+#else
+/* Min window size (in ns) = 20ms */
+#define MIN_SCHED_RAVG_WINDOW 20000000
+#endif
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
+unsigned int __read_mostly walt_disabled = 0;
+
+__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
+
+/*
+ * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
+ * associated with them. This is required for atomic update of those variables
+ * when being modified via the sysctl interface.
+ *
+ * IMPORTANT: Initialize both copies to the same value!!
+ */
+
+__read_mostly unsigned int sched_ravg_hist_size = 5;
+__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
+
+__read_mostly unsigned int sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG;
+__read_mostly unsigned int sysctl_sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG;
+
+static __read_mostly unsigned int sched_io_is_busy = 1;
+
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+	(10 * NSEC_PER_MSEC);
+
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
+
+/*
+ * An after-boot constant divisor for cpu_util_freq_walt() to apply the load
+ * boost.
+ */
+__read_mostly unsigned int walt_cpu_util_freq_divisor;
+
+/*
+ * Initial task load. Newly created tasks are assigned this load.
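+ * init_new_task_load() falls back to these values unless the forking task
+ * has set a per-task percentage via init_load_pct.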
+ */
+unsigned int __read_mostly sched_init_task_load_windows;
+unsigned int __read_mostly sched_init_task_load_windows_scaled;
+unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
+
+/*
+ * Maximum possible frequency across all cpus. Task demand and cpu
+ * capacity (cpu_power) metrics are scaled in reference to it.
+ */
+unsigned int max_possible_freq = 1;
+
+/*
+ * Minimum possible max_freq across all cpus. This will be the same as
+ * max_possible_freq on homogeneous systems and could be different from
+ * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
+ * cpu capacity (see capacity_scale_cpu_freq()).
+ */
+unsigned int min_max_freq = 1;
+
+unsigned int max_capacity = 1024; /* max(rq->capacity) */
+unsigned int min_capacity = 1024; /* min(rq->capacity) */
+unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+unsigned int
+min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
+
+/* Temporarily disable window-stats activity on all cpus */
+unsigned int __read_mostly sched_disable_window_stats;
+
+/*
+ * This governs what load needs to be used when reporting CPU busy time
+ * to the cpufreq governor.
+ */
+__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
+
+static int __init set_sched_ravg_window(char *str)
+{
+	unsigned int window_size;
+
+	get_option(&str, &window_size);
+
+	if (window_size < MIN_SCHED_RAVG_WINDOW ||
+	    window_size > MAX_SCHED_RAVG_WINDOW) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	sched_ravg_window = window_size;
+	return 0;
+}
+early_param("sched_ravg_window", set_sched_ravg_window);
+
+__read_mostly unsigned int walt_scale_demand_divisor;
+#define scale_demand(d) ((d)/walt_scale_demand_divisor)
+
+void inc_rq_walt_stats(struct rq *rq, struct task_struct *p)
+{
+	walt_inc_cumulative_runnable_avg(rq, p);
+}
+
+void dec_rq_walt_stats(struct rq *rq, struct task_struct *p)
+{
+	walt_dec_cumulative_runnable_avg(rq, p);
+}
+
+void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p,
+				   u16 updated_demand_scaled)
+{
+	s64 task_load_delta = (s64)updated_demand_scaled -
+			      p->ravg.demand_scaled;
+
+	fixup_cumulative_runnable_avg(&rq->walt_stats, task_load_delta);
+
+	walt_fixup_cum_window_demand(rq, task_load_delta);
+}
+
+static u64
+update_window_start(struct rq *rq, u64 wallclock, int event)
+{
+	s64 delta;
+	int nr_windows;
+	u64 old_window_start = rq->window_start;
+
+	delta = wallclock - rq->window_start;
+	BUG_ON(delta < 0);
+	if (delta < sched_ravg_window)
+		return old_window_start;
+
+	nr_windows = div64_u64(delta, sched_ravg_window);
+	rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
+
+	rq->cum_window_demand_scaled =
+			rq->walt_stats.cumulative_runnable_avg_scaled;
+
+	return old_window_start;
+}
+
+void sched_account_irqtime(int cpu, struct task_struct *curr,
+			   u64 delta, u64 wallclock)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags, nr_windows;
+	u64 cur_jiffies_ts;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	/*
+	 * cputime (wallclock) uses sched_clock so use the same here for
+	 * consistency.
+	 */
+	delta += sched_clock() - wallclock;
+	cur_jiffies_ts = get_jiffies_64();
+
+	if (is_idle_task(curr))
+		update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
+				 delta);
+
+	nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+	if (nr_windows) {
+		if (nr_windows < 10) {
+			/* Decay CPU's irqload by 3/4 for each window.
*/ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +static int +account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event) +{ + /* + * No need to bother updating task demand for exiting tasks + * or the idle task. + */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* + * When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. + */ + if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + /* + * The idle exit time is not accounted for the first task _picked_ up to + * run on the idle CPU. + */ + if (event == PICK_NEXT_TASK && rq->curr == rq->idle) + return 0; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0; + } + + return 1; +} + +/* + * In this function we match the accumulated subtractions with the current + * and previous windows we are operating with. Ignore any entries where + * the window start in the load_subtraction struct does not match either + * the curent or the previous window. This could happen whenever CPUs + * become idle or busy with interrupts disabled for an extended period. + */ +static inline void account_load_subtractions(struct rq *rq) +{ + u64 ws = rq->window_start; + u64 prev_ws = ws - sched_ravg_window; + struct load_subtractions *ls = rq->load_subs; + int i; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + if (ls[i].window_start == ws) { + rq->curr_runnable_sum -= ls[i].subs; + rq->nt_curr_runnable_sum -= ls[i].new_subs; + } else if (ls[i].window_start == prev_ws) { + rq->prev_runnable_sum -= ls[i].subs; + rq->nt_prev_runnable_sum -= ls[i].new_subs; + } + + ls[i].subs = 0; + ls[i].new_subs = 0; + } + + BUG_ON((s64)rq->prev_runnable_sum < 0); + BUG_ON((s64)rq->curr_runnable_sum < 0); + BUG_ON((s64)rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)rq->nt_curr_runnable_sum < 0); +} + +static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) +{ + rq->load_subs[index].window_start = ws; + rq->load_subs[index].subs = 0; + rq->load_subs[index].new_subs = 0; +} + +static bool get_subtraction_index(struct rq *rq, u64 ws) +{ + int i; + u64 oldest = ULLONG_MAX; + int oldest_index = 0; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + u64 entry_ws = rq->load_subs[i].window_start; + + if (ws == entry_ws) + return i; + + if (entry_ws < oldest) { + oldest = entry_ws; + oldest_index = i; + } + } + + create_subtraction_entry(rq, ws, oldest_index); + return oldest_index; +} + +static void update_rq_load_subtractions(int index, struct rq *rq, + u32 sub_load, bool new_task) +{ + rq->load_subs[index].subs += sub_load; + if (new_task) + rq->load_subs[index].new_subs += sub_load; +} + +void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task) +{ + struct sched_cluster *cluster = cpu_cluster(cpu); + struct cpumask cluster_cpus = cluster->cpus; + u64 prev_ws = ws - sched_ravg_window; + int i; + + 
cpumask_clear_cpu(cpu, &cluster_cpus); + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(i, &cluster_cpus) { + struct rq *rq = cpu_rq(i); + int index; + + if (p->ravg.curr_window_cpu[i]) { + index = get_subtraction_index(rq, ws); + update_rq_load_subtractions(index, rq, + p->ravg.curr_window_cpu[i], new_task); + p->ravg.curr_window_cpu[i] = 0; + } + + if (p->ravg.prev_window_cpu[i]) { + index = get_subtraction_index(rq, prev_ws); + update_rq_load_subtractions(index, rq, + p->ravg.prev_window_cpu[i], new_task); + p->ravg.prev_window_cpu[i] = 0; + } + } + + raw_spin_unlock(&cluster->load_lock); +} + +static inline void inter_cluster_migration_fixup + (struct task_struct *p, int new_cpu, int task_cpu, bool new_task) +{ + struct rq *dest_rq = cpu_rq(new_cpu); + struct rq *src_rq = cpu_rq(task_cpu); + + if (same_freq_domain(new_cpu, task_cpu)) + return; + + p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window; + + dest_rq->curr_runnable_sum += p->ravg.curr_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + + src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu]; + src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu]; + + if (new_task) { + dest_rq->nt_curr_runnable_sum += p->ravg.curr_window; + dest_rq->nt_prev_runnable_sum += p->ravg.prev_window; + + src_rq->nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[task_cpu]; + src_rq->nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[task_cpu]; + } + + p->ravg.curr_window_cpu[task_cpu] = 0; + p->ravg.prev_window_cpu[task_cpu] = 0; + + update_cluster_load_subtractions(p, task_cpu, + src_rq->window_start, new_task); + + BUG_ON((s64)src_rq->prev_runnable_sum < 0); + BUG_ON((s64)src_rq->curr_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); +} + +void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + bool new_task; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + if (sched_disable_window_stats) + goto done; + + wallclock = sched_ktime_clock(); + + update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + /* + * When a task is migrating during the wakeup, adjust + * the task's contribution towards cumulative window + * demand. 
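+ * The adjustment is applied only when the task last went to sleep within
+ * the source CPU's current window (p->last_sleep_ts >= window_start).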
+ */ + if (p->state == TASK_WAKING && p->last_sleep_ts >= + src_rq->window_start) { + walt_fixup_cum_window_demand(src_rq, + -(s64)p->ravg.demand_scaled); + walt_fixup_cum_window_demand(dest_rq, p->ravg.demand_scaled); + } + + new_task = is_new_task(p); + + inter_cluster_migration_fixup(p, new_cpu, + task_cpu(p), new_task); + + if (!same_freq_domain(new_cpu, task_cpu(p))) + irq_work_queue(&walt_migration_irq_work); + +done: + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +void set_window_start(struct rq *rq) +{ + static int sync_cpu_available; + + if (likely(rq->window_start)) + return; + + if (!sync_cpu_available) { + rq->window_start = 1; + sync_cpu_available = 1; + atomic64_set(&walt_irq_work_lastq_ws, rq->window_start); + walt_load_reported_window = + atomic64_read(&walt_irq_work_lastq_ws); + + } else { + struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask)); + + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = sync_rq->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + u16 demand_scaled; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + demand_scaled = scale_demand(demand); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements walt stats + * avoid decrementing it here again. + * + * When window is rolled over, the cumulative window demand + * is reset to the cumulative runnable average (contribution from + * the tasks on the runqueue). If the current task is dequeued + * already, it's demand is not included in the cumulative runnable + * average. So add the task demand separately to cumulative window + * demand. 
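+ * (That case is handled by the rq->curr == p branch below.)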
+ */ + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) { + if (task_on_rq_queued(p) + && p->sched_class->fixup_walt_sched_stats) + p->sched_class->fixup_walt_sched_stats(rq, p, + demand_scaled); + else if (rq->curr == p) + walt_fixup_cum_window_demand(rq, demand_scaled); + } + + p->ravg.demand = demand; + p->ravg.demand_scaled = demand_scaled; + +done: + trace_sched_update_history(rq, p, runtime, samples, event); +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned long capcurr = capacity_curr_of(cpu_of(rq)); + + delta = (delta * capcurr) >> SCHED_CAPACITY_SHIFT; + + return delta; +} + +static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > sched_ravg_window)) + p->ravg.sum = sched_ravg_window; + + return delta; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static u64 update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + u64 runtime; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(rq, p, event)) { + if (new_window) + /* + * If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. + */ + update_history(rq, p, p->ravg.sum, 1, event); + return 0; + } + + if (!new_window) { + /* + * The simple case - busy time contained within the existing + * window. + */ + return add_to_task_demand(rq, p, wallclock - mark_start); + } + + /* + * Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. 
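+ * window_start is rolled forward to its current value again below, once
+ * the intervening full windows have been accounted.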
+ */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + runtime = add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) { + u64 scaled_window = scale_exec_time(window_size, rq); + + update_history(rq, p, scaled_window, nr_full_windows, event); + runtime += nr_full_windows * scaled_window; + } + + /* + * Roll window_start back to current to process any remainder + * in current window. + */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + runtime += add_to_task_demand(rq, p, wallclock - mark_start); + + return runtime; +} + +static u32 empty_windows[NR_CPUS]; + +static void rollover_task_window(struct task_struct *p, bool full_window) +{ + u32 *curr_cpu_windows = empty_windows; + u32 curr_window; + int i; + + /* Rollover the sum */ + curr_window = 0; + + if (!full_window) { + curr_window = p->ravg.curr_window; + curr_cpu_windows = p->ravg.curr_window_cpu; + } + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + + /* Roll over individual CPU contributions */ + for (i = 0; i < nr_cpu_ids; i++) { + p->ravg.prev_window_cpu[i] = curr_cpu_windows[i]; + p->ravg.curr_window_cpu[i] = 0; + } +} + +static void rollover_cpu_window(struct rq *rq, bool full_window) +{ + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + + if (unlikely(full_window)) { + curr_sum = 0; + nt_curr_sum = 0; + } + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; +} + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? 
SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ + return SCHED_FREQ_ACCOUNT_WAIT_TIME; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = sched_ravg_window; + u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + bool new_task; + int cpu = rq->cpu; + + new_window = mark_start < window_start; + if (new_window) { + full_window = (window_start - mark_start) >= window_size; + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + new_task = is_new_task(p); + + /* + * Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. + */ + if (!is_idle_task(p) && !exiting_task(p)) { + if (new_window) + rollover_task_window(p, full_window); + } + + if (p_is_curr_task && new_window) { + rollover_cpu_window(rq, full_window); + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; + + if (!new_window) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. + */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window += delta; + p->ravg.curr_window_cpu[cpu] += delta; + } + + goto done; + } + + if (!p_is_curr_task) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. 
 */
+		delta = scale_exec_time(wallclock - window_start, rq);
+		*curr_runnable_sum += delta;
+		if (new_task)
+			*nt_curr_runnable_sum += delta;
+
+		if (!exiting_task(p)) {
+			p->ravg.curr_window = delta;
+			p->ravg.curr_window_cpu[cpu] = delta;
+		}
+
+		goto done;
+	}
+
+	if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+		/*
+		 * account_busy_for_cpu_time() = 1 so busy time needs
+		 * to be accounted to the current window. A new window
+		 * has started and p is the current task so rollover is
+		 * needed. If any of these three above conditions are true
+		 * then this busy time can't be accounted as irqtime.
+		 *
+		 * Busy time for the idle task or exiting tasks need not
+		 * be accounted.
+		 *
+		 * An example of this would be a task that starts execution
+		 * and then sleeps once a new window has begun.
+		 */
+
+		if (!full_window) {
+			/*
+			 * A full window hasn't elapsed, account partial
+			 * contribution to previous completed window.
+			 */
+			delta = scale_exec_time(window_start - mark_start, rq);
+			if (!is_idle_task(p) && !exiting_task(p)) {
+				p->ravg.prev_window += delta;
+				p->ravg.prev_window_cpu[cpu] += delta;
+			}
+		} else {
+			/*
+			 * Since at least one full window has elapsed,
+			 * the contribution to the previous window is the
+			 * full window (window_size).
+			 */
+			delta = scale_exec_time(window_size, rq);
+			if (!is_idle_task(p) && !exiting_task(p)) {
+				p->ravg.prev_window = delta;
+				p->ravg.prev_window_cpu[cpu] = delta;
+			}
+		}
+
+		/*
+		 * Rollover is done here by overwriting the values in
+		 * prev_runnable_sum and curr_runnable_sum.
+		 */
+		*prev_runnable_sum += delta;
+		if (new_task)
+			*nt_prev_runnable_sum += delta;
+
+		/* Account piece of busy time in the current window. */
+		delta = scale_exec_time(wallclock - window_start, rq);
+		*curr_runnable_sum += delta;
+		if (new_task)
+			*nt_curr_runnable_sum += delta;
+
+		if (!is_idle_task(p) && !exiting_task(p)) {
+			p->ravg.curr_window = delta;
+			p->ravg.curr_window_cpu[cpu] = delta;
+		}
+
+		goto done;
+	}
+
+	if (irqtime) {
+		/*
+		 * account_busy_for_cpu_time() = 1 so busy time needs
+		 * to be accounted to the current window. A new window
+		 * has started and p is the current task so rollover is
+		 * needed. The current task must be the idle task because
+		 * irqtime is not accounted for any other task.
+		 *
+		 * Irqtime will be accounted each time we process IRQ activity
+		 * after a period of idleness, so we know the IRQ busy time
+		 * started at wallclock - irqtime.
+		 */
+
+		BUG_ON(!is_idle_task(p));
+		mark_start = wallclock - irqtime;
+
+		/*
+		 * Roll window over. If IRQ busy time was just in the current
+		 * window then that is all that need be accounted.
+		 */
+		if (mark_start > window_start) {
+			*curr_runnable_sum = scale_exec_time(irqtime, rq);
+			return;
+		}
+
+		/*
+		 * The IRQ busy time spanned multiple windows. Account the
+		 * busy time preceding the current window start first.
+		 */
+		delta = window_start - mark_start;
+		if (delta > window_size)
+			delta = window_size;
+		delta = scale_exec_time(delta, rq);
+		*prev_runnable_sum += delta;
+
+		/* Process the remaining IRQ busy time in the current window.
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + +done: + return; +} + +static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq) +{ + u64 result; + + if (old_window_start == rq->window_start) + return; + + result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start, + rq->window_start); + if (result == old_window_start) + irq_work_queue(&walt_cpufreq_irq_work); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 old_window_start; + + if (!rq->window_start || sched_disable_window_stats || + p->ravg.mark_start == wallclock) + return; + + lockdep_assert_held(&rq->lock); + + old_window_start = update_window_start(rq, wallclock, event); + + if (!p->ravg.mark_start) { + goto done; + } + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime); + + if (exiting_task(p)) + goto done; + +done: + p->ravg.mark_start = wallclock; + + run_walt_irq_work(old_window_start, rq); +} + +int sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec(table, write, buffer, length, ppos); + if (rc) + return rc; + + sysctl_sched_init_task_load_pct = sysctl_sched_walt_init_task_load_pct; + + return 0; +} + +u32 sched_get_init_task_load(struct task_struct *p) +{ + return p->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + p->init_load_pct = init_load_pct; + + return 0; +} + +void init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_windows_scaled = sched_init_task_load_windows_scaled; + u32 init_load_pct = current->init_load_pct; + + p->last_sleep_ts = 0; + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), + GFP_KERNEL | __GFP_NOFAIL); + p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), + GFP_KERNEL | __GFP_NOFAIL); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + init_load_windows_scaled = scale_demand(init_load_windows); + } + + p->ravg.demand = init_load_windows; + p->ravg.demand_scaled = init_load_windows_scaled; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} + +void free_task_load_ptrs(struct task_struct *p) +{ + kfree(p->ravg.curr_window_cpu); + kfree(p->ravg.prev_window_cpu); + + /* + * update_task_ravg() can be called for exiting tasks. While the + * function itself ensures correct behavior, the corresponding + * trace event requires that these pointers be NULL. 
+ */ + p->ravg.curr_window_cpu = NULL; + p->ravg.prev_window_cpu = NULL; +} + +void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + u32 *curr_window_ptr = NULL; + u32 *prev_window_ptr = NULL; + + if (exiting_task(p)) { + sum = EXITING_TASK_MARKER; + } else { + curr_window_ptr = p->ravg.curr_window_cpu; + prev_window_ptr = p->ravg.prev_window_cpu; + memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + } + + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = curr_window_ptr; + p->ravg.prev_window_cpu = prev_window_ptr; + + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start || sched_disable_window_stats) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +unsigned int max_possible_efficiency = 1; +unsigned int min_possible_efficiency = UINT_MAX; +unsigned int max_power_cost = 1; + +static cpumask_t all_cluster_cpus = CPU_MASK_NONE; +DECLARE_BITMAP(all_cluster_ids, NR_CPUS); +struct sched_cluster *sched_cluster[NR_CPUS]; +int num_clusters; + +struct list_head cluster_head; + +static void +insert_cluster(struct sched_cluster *cluster, struct list_head *head) +{ + struct sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (cluster->max_power_cost < tmp->max_power_cost) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); + if (!cluster) { + pr_warn("Cluster allocation failed. 
Possible bad scheduling\n"); + return NULL; + } + + INIT_LIST_HEAD(&cluster->list); + cluster->max_power_cost = 1; + cluster->min_power_cost = 1; + cluster->capacity = 1024; + cluster->max_possible_capacity = 1024; + cluster->efficiency = 1; + cluster->load_scale_factor = 1024; + cluster->cur_freq = 1; + cluster->max_freq = 1; + cluster->min_freq = 1; + cluster->max_possible_freq = 1; + cluster->freq_init_done = false; + + raw_spin_lock_init(&cluster->load_lock); + cluster->cpus = *cpus; + cluster->efficiency = topology_get_cpu_scale(cpumask_first(cpus)); + + if (cluster->efficiency > max_possible_efficiency) + max_possible_efficiency = cluster->efficiency; + if (cluster->efficiency < min_possible_efficiency) + min_possible_efficiency = cluster->efficiency; + + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + + if (!cluster) + return; + + for_each_cpu(i, cpus) + cpu_rq(i)->cluster = cluster; + + insert_cluster(cluster, head); + set_bit(num_clusters, all_cluster_ids); + num_clusters++; +} + +static int compute_max_possible_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; + capacity >>= 10; + + return capacity; +} + +void walt_update_min_max_capacity(void) +{ + unsigned long flags; + + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + __update_min_max_capacity(); + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +static int +compare_clusters(void *priv, const struct list_head *a, const struct list_head *b) +{ + struct sched_cluster *cluster1, *cluster2; + int ret; + + cluster1 = container_of(a, struct sched_cluster, list); + cluster2 = container_of(b, struct sched_cluster, list); + + /* + * Don't assume higher capacity means higher power. If the + * power cost is same, sort the higher capacity cluster before + * the lower capacity cluster to start placing the tasks + * on the higher capacity cluster. + */ + ret = cluster1->max_power_cost > cluster2->max_power_cost || + (cluster1->max_power_cost == cluster2->max_power_cost && + cluster1->max_possible_capacity < + cluster2->max_possible_capacity); + + return ret; +} + +void sort_clusters(void) +{ + struct sched_cluster *cluster; + struct list_head new_head; + unsigned int tmp_max = 1; + + INIT_LIST_HEAD(&new_head); + + for_each_sched_cluster(cluster) { + cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), + max_task_load()); + cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), + 0); + + if (cluster->max_power_cost > tmp_max) + tmp_max = cluster->max_power_cost; + } + max_power_cost = tmp_max; + + move_list(&new_head, &cluster_head, true); + + list_sort(NULL, &new_head, compare_clusters); + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
+ */ + move_list(&cluster_head, &new_head, false); +} + +static void update_all_clusters_stats(void) +{ + struct sched_cluster *cluster; + u64 highest_mpc = 0, lowest_mpc = U64_MAX; + unsigned long flags; + + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + + for_each_sched_cluster(cluster) { + u64 mpc; + + cluster->capacity = compute_capacity(cluster); + mpc = cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + cluster->exec_scale_factor = + DIV_ROUND_UP(cluster->efficiency * 1024, + max_possible_efficiency); + + if (mpc > highest_mpc) + highest_mpc = mpc; + + if (mpc < lowest_mpc) + lowest_mpc = mpc; + } + + max_possible_capacity = highest_mpc; + min_max_possible_capacity = lowest_mpc; + + __update_min_max_capacity(); + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +void update_cluster_topology(void) +{ + struct cpumask cpus = *cpu_possible_mask; + const struct cpumask *cluster_cpus; + struct list_head new_head; + int i; + + INIT_LIST_HEAD(&new_head); + + for_each_cpu(i, &cpus) { + cluster_cpus = cpu_coregroup_mask(i); + cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); + cpumask_andnot(&cpus, &cpus, cluster_cpus); + add_cluster(cluster_cpus, &new_head); + } + + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. + */ + move_list(&cluster_head, &new_head, false); + update_all_clusters_stats(); +} + +struct sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .max_power_cost = 1, + .min_power_cost = 1, + .capacity = 1024, + .max_possible_capacity = 1024, + .efficiency = 1, + .load_scale_factor = 1024, + .cur_freq = 1, + .max_freq = 1, + .min_freq = 1, + .max_possible_freq = 1, + .exec_scale_factor = 1024, +}; + +void init_clusters(void) +{ + bitmap_clear(all_cluster_ids, 0, NR_CPUS); + init_cluster.cpus = *cpu_possible_mask; + raw_spin_lock_init(&init_cluster.load_lock); + INIT_LIST_HEAD(&cluster_head); +} + +static unsigned long cpu_max_table_freq[NR_CPUS]; + +void update_cpu_cluster_capacity(const cpumask_t *cpus) +{ + int i; + struct sched_cluster *cluster; + struct cpumask cpumask; + unsigned long flags; + + cpumask_copy(&cpumask, cpus); + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + cluster->capacity = compute_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + } + + __update_min_max_capacity(); + + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + struct sched_cluster *cluster = NULL; + struct cpumask policy_cluster = *policy->related_cpus; + unsigned int orig_max_freq = 0; + int i, j, update_capacity = 0; + + if (val != CPUFREQ_CREATE_POLICY) + return 0; + + walt_update_min_max_capacity(); + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + for_each_cpu(i, &policy_cluster) + cpu_max_table_freq[i] = policy->cpuinfo.max_freq; + + for_each_cpu(i, &policy_cluster) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&policy_cluster, 
&policy_cluster, + &cluster->cpus); + + orig_max_freq = cluster->max_freq; + cluster->min_freq = policy->min; + cluster->max_freq = policy->max; + cluster->cur_freq = policy->cur; + + if (!cluster->freq_init_done) { + mutex_lock(&cluster_lock); + for_each_cpu(j, &cluster->cpus) + cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, + policy->related_cpus); + cluster->max_possible_freq = policy->cpuinfo.max_freq; + cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->freq_init_done = true; + + sort_clusters(); + update_all_clusters_stats(); + mutex_unlock(&cluster_lock); + continue; + } + + update_capacity += (orig_max_freq != cluster->max_freq); + } + + if (update_capacity) + update_cpu_cluster_capacity(policy->related_cpus); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->policy->cpu, new_freq = freq->new; + unsigned long flags; + struct sched_cluster *cluster; + struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; + int i, j; + + if (val != CPUFREQ_POSTCHANGE) + return NOTIFY_DONE; + + if (cpu_cur_freq(cpu) == new_freq) + return NOTIFY_OK; + + for_each_cpu(i, &policy_cpus) { + cluster = cpu_rq(i)->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->lock, flags); + update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return NOTIFY_OK; +} + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_walt_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return ret; +} +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_walt_callback); + +/* + * Runs in hard-irq context. This should ideally run just after the latest + * window roll-over. + */ +void walt_irq_work(struct irq_work *irq_work) +{ + struct sched_cluster *cluster; + struct rq *rq; + int cpu; + u64 wc; + bool is_migration = false; + int level = 0; + + /* Am I the window rollover work or the migration work? 
*/ + if (irq_work == &walt_migration_irq_work) + is_migration = true; + + for_each_cpu(cpu, cpu_possible_mask) { + if (level == 0) + raw_spin_lock(&cpu_rq(cpu)->lock); + else + raw_spin_lock_nested(&cpu_rq(cpu)->lock, level); + level++; + } + + wc = sched_ktime_clock(); + walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws); + for_each_sched_cluster(cluster) { + raw_spin_lock(&cluster->load_lock); //TODO: rtg + + for_each_cpu(cpu, &cluster->cpus) { + rq = cpu_rq(cpu); + if (rq->curr) { + update_task_ravg(rq->curr, rq, + TASK_UPDATE, wc, 0); + account_load_subtractions(rq); + } + } + + raw_spin_unlock(&cluster->load_lock); + } + + for_each_sched_cluster(cluster) { + cpumask_t cluster_online_cpus; + unsigned int num_cpus, i = 1; + + cpumask_and(&cluster_online_cpus, &cluster->cpus, + cpu_online_mask); + num_cpus = cpumask_weight(&cluster_online_cpus); + for_each_cpu(cpu, &cluster_online_cpus) { + int flag = SCHED_CPUFREQ_WALT; + + rq = cpu_rq(cpu); + + if (i == num_cpus) + cpufreq_update_util(cpu_rq(cpu), flag); + else + cpufreq_update_util(cpu_rq(cpu), flag | + SCHED_CPUFREQ_CONTINUE); + i++; + } + } + + for_each_cpu(cpu, cpu_possible_mask) + raw_spin_unlock(&cpu_rq(cpu)->lock); + + //if (!is_migration) + //core_ctl_check(this_rq()->window_start); +} + +static void walt_init_once(void) +{ + init_irq_work(&walt_migration_irq_work, walt_irq_work); + init_irq_work(&walt_cpufreq_irq_work, walt_irq_work); + + walt_cpu_util_freq_divisor = + (sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100; + walt_scale_demand_divisor = sched_ravg_window >> SCHED_CAPACITY_SHIFT; + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + sched_init_task_load_windows_scaled = + scale_demand(sched_init_task_load_windows); +} + +void walt_sched_init_rq(struct rq *rq) +{ + static bool init; + int j; + + if (!init) { + walt_init_once(); + init = true; + } + + cpumask_set_cpu(cpu_of(rq), &rq->freq_domain_cpumask); + + rq->walt_stats.cumulative_runnable_avg_scaled = 0; + rq->window_start = 0; + rq->walt_flags = 0; + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; + + /* + * All cpus part of same cluster by default. This avoids the + * need to check for rq->cluster being non-NULL in hot-paths + * like select_best_cpu() + */ + rq->cluster = &init_cluster; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + rq->cum_window_demand_scaled = 0; + + for (j = 0; j < NUM_TRACKED_WINDOWS; j++) { + memset(&rq->load_subs[j], 0, sizeof(struct load_subtractions)); + } +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 0000000000000000000000000000000000000000..e22961349e9c81a9b41f4597f9fd18857bf423ac --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,236 @@ +/* + * walt.h + * + * head file for Window-Assistant-Load-Tracking + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +#include + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +#define SCHED_NEW_TASK_WINDOWS 5 + +extern unsigned int sched_ravg_window; +extern unsigned int sysctl_sched_walt_init_task_load_pct; + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static inline struct sched_cluster *cpu_cluster(int cpu) +{ + return cpu_rq(cpu)->cluster; +} + +static inline bool is_new_task(struct task_struct *p) +{ + return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS; +} + +static inline unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +static inline void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. */ + smp_mb(); + dst->next = first; +} + +extern void reset_task_stats(struct task_struct *p); +extern void update_cluster_topology(void); +extern void init_clusters(void); +extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); + +static inline void +fixup_cumulative_runnable_avg(struct walt_sched_stats *stats, + s64 demand_scaled_delta) +{ + if (sched_disable_window_stats) + return; + + stats->cumulative_runnable_avg_scaled += demand_scaled_delta; + BUG_ON((s64)stats->cumulative_runnable_avg_scaled < 0); +} + +static inline void +walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, p->ravg.demand_scaled); + + /* + * Add a task's contribution to the cumulative window demand when + * + * (1) task is enqueued with on_rq = 1 i.e migration, + * prio/cgroup/class change. + * (2) task is waking for the first time in this window. + */ + if (p->on_rq || (p->last_sleep_ts < rq->window_start)) + walt_fixup_cum_window_demand(rq, p->ravg.demand_scaled); +} + +static inline void +walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, + -(s64)p->ravg.demand_scaled); + + /* + * on_rq will be 1 for sleeping tasks. So check if the task + * is migrating or dequeuing in RUNNING state to change the + * prio/cgroup/class. 
+ */ + if (task_on_rq_migrating(p) || p->state == TASK_RUNNING) + walt_fixup_cum_window_demand(rq, -(s64)p->ravg.demand_scaled); +} +extern void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +extern void inc_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void dec_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void fixup_busy_time(struct task_struct *p, int new_cpu); +extern void init_new_task_load(struct task_struct *p); +extern void mark_task_starting(struct task_struct *p); +extern void set_window_start(struct rq *rq); +void account_irqtime(int cpu, struct task_struct *curr, u64 delta, u64 wallclock); + +void walt_irq_work(struct irq_work *irq_work); + +void walt_sched_init_rq(struct rq *rq); + +extern void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock); + +#define SCHED_HIGH_IRQ_TIMEOUT 3 +static inline u64 sched_irqload(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + s64 delta; + + delta = get_jiffies_64() - rq->irqload_ts; + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. + */ + + if (delta < SCHED_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +static inline int sched_cpu_high_irqload(int cpu) +{ + return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; +} + +extern int +sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); + +static inline unsigned int cpu_cur_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->cur_freq; +} + +static inline void assign_cluster_ids(struct list_head *head) +{ + struct sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } +} + +#else /* CONFIG_SCHED_WALT */ +static inline void walt_sched_init_rq(struct rq *rq) { } + +static inline void update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } + +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) { } + +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) { } + +static inline void +inc_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void init_new_task_load(struct task_struct *p) { } +static inline void mark_task_starting(struct task_struct *p) { } +static inline void set_window_start(struct rq *rq) { } +static inline void update_cluster_topology(void) { } +static inline void init_clusters(void) { } + +static inline void +fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) { } + +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) { } + +static inline u64 sched_irqload(int cpu) +{ + return 0; +} +static inline int sched_cpu_high_irqload(int cpu) +{ + return 0; +} +#endif /* CONFIG_SCHED_WALT */ + +#endif /* __WALT_H */ diff --git a/kernel/smp.c b/kernel/smp.c index f73a597c8e4cf44bab200d61323e8eb2ca6e5e4f..92742aa1e3480e85830acb62421ba3d73ce0c03a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -957,7 +957,8 @@ void 
wake_up_all_idle_cpus(void) if (cpu == smp_processor_id()) continue; - wake_up_if_idle(cpu); + if (!cpu_isolated(cpu)) + wake_up_if_idle(cpu); } preempt_enable(); } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 890b79cf0e7c311c94e1ce5aa882a5f7f7d4e335..51c1fe80a9f64009767160537d77c416c8b39733 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -442,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask, * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. */ -static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { int ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1d8b4358aa11e60e534305c9835f4f9b96ae5fba..f13b9e456f50188be4e9e003f114a9a66d002ea8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,6 +1659,40 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_walt_init_task_load_pct_sysctl_handler, + }, + { + .procname = "sched_cpu_high_irqload", + .data = &sysctl_sched_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_SCHED_DEBUG { .procname = "sched_min_granularity_ns", diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4ef90718c1146dfe0f875650bd3750dc9523b8aa..30abe5f4ce4ebfde52a92e0eff9a970cda1402ad 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1004,12 +1004,9 @@ static void __remove_hrtimer(struct hrtimer *timer, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, newstate); - if (!(state & HRTIMER_STATE_ENQUEUED)) - return; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -1024,19 +1021,25 @@ static void __remove_hrtimer(struct hrtimer *timer, */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); + +out: + /* + * We need to preserve PINNED state here, otherwise we may end up + * migrating pinned hrtimers as well. 
+ */ + WRITE_ONCE(timer->state, newstate | (timer->state & HRTIMER_STATE_PINNED)); } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - bool restart, bool keep_local) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { - bool reprogram; + int reprogram; /* * Remove the timer and force reprogramming when high @@ -1049,18 +1052,11 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * If the timer is not restarted then reprogramming is - * required if the timer is local. If it is local and about - * to be restarted, avoid programming it twice (on removal - * and a moment later when it's requeued). - */ if (!restart) state = HRTIMER_STATE_INACTIVE; - else - reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); + timer->state &= ~HRTIMER_STATE_PINNED; return 1; } return 0; @@ -1112,31 +1108,9 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; - bool force_local, first; - /* - * If the timer is on the local cpu base and is the first expiring - * timer then this might end up reprogramming the hardware twice - * (on removal and on enqueue). To avoid that by prevent the - * reprogram on removal, keep the timer local to the current CPU - * and enforce reprogramming after it is queued no matter whether - * it is the new first expiring timer again or not. - */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - force_local &= base->cpu_base->next_timer == timer; - - /* - * Remove an active timer from the queue. In case it is not queued - * on the current CPU, make sure that remove_hrtimer() updates the - * remote data correctly. - * - * If it's on the current CPU and the first expiring timer, then - * skip reprogramming, keep the timer local and enforce - * reprogramming later if it was the first expiring timer. This - * avoids programming the underlying clock event twice (once at - * removal and once after enqueue). - */ - remove_hrtimer(timer, base, true, force_local); + /* Remove an active timer from the queue: */ + remove_hrtimer(timer, base, true); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); @@ -1146,24 +1120,13 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ - if (!force_local) { - new_base = switch_hrtimer_base(timer, base, - mode & HRTIMER_MODE_PINNED); - } else { - new_base = base; - } + new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + /* Update pinned state */ + timer->state &= ~HRTIMER_STATE_PINNED; + timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT; - /* - * Timer was forced to stay on the current CPU to avoid - * reprogramming on removal and enqueue. Force reprogram the - * hardware by evaluating the new first expiring timer. 
- */ - hrtimer_force_reprogram(new_base->cpu_base, 1); - return 0; + return enqueue_hrtimer(timer, new_base, mode); } /** @@ -1229,7 +1192,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false, false); + ret = remove_hrtimer(timer, base, false); unlock_hrtimer_base(timer, &flags); @@ -2083,14 +2046,21 @@ int hrtimers_prepare_cpu(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, + bool remove_pinned) { struct hrtimer *timer; struct timerqueue_node *node; + struct timerqueue_head pinned; + int is_pinned; + bool is_hotplug = !cpu_online(old_base->cpu_base->cpu); + + timerqueue_init_head(&pinned); while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); - BUG_ON(hrtimer_callback_running(timer)); + if (is_hotplug) + BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* @@ -2099,6 +2069,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + + is_pinned = timer->state & HRTIMER_STATE_PINNED; + if (!remove_pinned && is_pinned) { + timerqueue_add(&pinned, &timer->node); + continue; + } + timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -2110,23 +2087,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } + + /* Re-queue pinned timers for non-hotplug usecase */ + while ((node = timerqueue_getnext(&pinned))) { + timer = container_of(node, struct hrtimer, node); + + timerqueue_del(&pinned, &timer->node); + enqueue_hrtimer(timer, old_base, HRTIMER_MODE_ABS); + } } -int hrtimers_dead_cpu(unsigned int scpu) +static void __migrate_hrtimers(unsigned int scpu, bool remove_pinned) { struct hrtimer_cpu_base *old_base, *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); - - /* - * this BH disable ensures that raise_softirq_irqoff() does - * not wakeup ksoftirqd (and acquire the pi-lock) while - * holding the cpu_base lock - */ - local_bh_disable(); - local_irq_disable(); + local_irq_save(flags); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); /* @@ -2138,7 +2115,7 @@ int hrtimers_dead_cpu(unsigned int scpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + &new_base->clock_base[i], remove_pinned); } /* @@ -2152,11 +2129,30 @@ int hrtimers_dead_cpu(unsigned int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); +} + +int hrtimers_dead_cpu(unsigned int scpu) +{ + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + /* + * this BH disable ensures that raise_softirq_irqoff() does + * not wakeup ksoftirqd (and acquire the pi-lock) while + * holding the cpu_base lock + */ + local_bh_disable(); + __migrate_hrtimers(scpu, true); local_bh_enable(); return 0; } +void hrtimer_quiesce_cpu(void *cpup) +{ + __migrate_hrtimers(*(int *)cpup, false); +} + #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 
a3ec21be3b140f3cc4c54be07495d1b8f799b4fe..b09a20c2502d79023c7b3d4bfa00558e4428f4c9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1921,14 +1921,20 @@ signed long __sched schedule_timeout_idle(signed long timeout) EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) +static void migrate_timer_list(struct timer_base *new_base, + struct hlist_head *head, bool remove_pinned) { struct timer_list *timer; int cpu = new_base->cpu; + struct hlist_node *n; + int is_pinned; - while (!hlist_empty(head)) { - timer = hlist_entry(head->first, struct timer_list, entry); - detach_timer(timer, false); + hlist_for_each_entry_safe(timer, n, head, entry) { + is_pinned = timer->flags & TIMER_PINNED; + if (!remove_pinned && is_pinned) + continue; + + detach_if_pending(timer, get_timer_base(timer->flags), false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } @@ -1949,14 +1955,13 @@ int timers_prepare_cpu(unsigned int cpu) return 0; } -int timers_dead_cpu(unsigned int cpu) +static void __migrate_timers(unsigned int cpu, bool remove_pinned) { struct timer_base *old_base; struct timer_base *new_base; + unsigned long flags; int b, i; - BUG_ON(cpu_online(cpu)); - for (b = 0; b < NR_BASES; b++) { old_base = per_cpu_ptr(&timer_bases[b], cpu); new_base = get_cpu_ptr(&timer_bases[b]); @@ -1964,7 +1969,7 @@ int timers_dead_cpu(unsigned int cpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - raw_spin_lock_irq(&new_base->lock); + raw_spin_lock_irqsave(&new_base->lock, flags); raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); /* @@ -1973,18 +1978,31 @@ int timers_dead_cpu(unsigned int cpu) */ forward_timer_base(new_base); - BUG_ON(old_base->running_timer); + if (!cpu_online(cpu)) + BUG_ON(old_base->running_timer); for (i = 0; i < WHEEL_SIZE; i++) - migrate_timer_list(new_base, old_base->vectors + i); + migrate_timer_list(new_base, old_base->vectors + i, + remove_pinned); raw_spin_unlock(&old_base->lock); - raw_spin_unlock_irq(&new_base->lock); + raw_spin_unlock_irqrestore(&new_base->lock, flags); put_cpu_ptr(&timer_bases); } +} + +int timers_dead_cpu(unsigned int cpu) +{ + BUG_ON(cpu_online(cpu)); + __migrate_timers(cpu, true); return 0; } +void timer_quiesce_cpu(void *cpup) +{ + __migrate_timers(*(unsigned int *)cpup, false); +} + #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 01bf977090dc2e699b105dd7c7e0c254d98ca95a..e0a05a779f7cb305c10ae7b8f124f2b142d584cf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -171,7 +172,7 @@ static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); -static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(unsigned int, watchdog_en);static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -428,16 +429,20 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void watchdog_enable(unsigned int cpu) +void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = 
this_cpu_ptr(&watchdog_hrtimer); struct completion *done = this_cpu_ptr(&softlockup_completion); + unsigned int *enabled = this_cpu_ptr(&watchdog_en); WARN_ON_ONCE(cpu != smp_processor_id()); init_completion(done); complete(done); + if (*enabled) + return; + /* * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. @@ -452,11 +457,24 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); + + /* + * Need to ensure above operations are observed by other CPUs before + * indicating that timer is enabled. This is to synchronize core + * isolation and hotplug. Core isolation will wait for this flag to be + * set. + */ + mb(); + *enabled = 1; } -static void watchdog_disable(unsigned int cpu) +void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = per_cpu_ptr(&watchdog_en, cpu); + + if (!*enabled) + return; WARN_ON_ONCE(cpu != smp_processor_id()); @@ -468,6 +486,17 @@ static void watchdog_disable(unsigned int cpu) watchdog_nmi_disable(cpu); hrtimer_cancel(hrtimer); wait_for_completion(this_cpu_ptr(&softlockup_completion)); + + /* + * No need for barrier here since disabling the watchdog is + * synchronized with hotplug lock + */ + *enabled = 0; +} + +bool watchdog_configured(unsigned int cpu) +{ + return *per_cpu_ptr(&watchdog_en, cpu); } static int softlockup_stop_fn(void *data) @@ -482,7 +511,6 @@ static void softlockup_stop_all(void) if (!softlockup_initialized) return; - for_each_cpu(cpu, &watchdog_allowed_mask) smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); diff --git a/mm/vmstat.c b/mm/vmstat.c index 698bc0bc18d146942151348bac4012dea31b09bb..15560b6b7704ec64faa5901fd462eb09ce27e669 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1863,7 +1863,7 @@ int vmstat_refresh(struct ctl_table *table, int write, static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats(true)) { + if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1955,7 +1955,8 @@ static void vmstat_shepherd(struct work_struct *w) for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); - if (!delayed_work_pending(dw) && need_update(cpu)) + if (!delayed_work_pending(dw) && need_update(cpu) && + !cpu_isolated(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } put_online_cpus();
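
---
Illustrative usage sketch (not part of the patch): the kernel/sysctl.c hunk above registers the WALT tunables as kern_table entries, so with CONFIG_SCHED_WALT enabled they should show up under /proc/sys/kernel/. The small userspace program below just reads them back and writes sched_walt_init_task_load_pct, the default initial-demand percentage for new tasks; the tunable names are taken from the patch, while the program itself, its file name, and the value 15 are assumptions of mine for testing only.

/* walt_sysctl_smoke.c -- poke the WALT sysctls added by this patch.
 * Build: gcc -Wall -o walt_sysctl_smoke walt_sysctl_smoke.c
 * Needs CONFIG_SCHED_WALT in the running kernel; run as root for the write.
 */
#include <stdio.h>

static const char *tunables[] = {
	"/proc/sys/kernel/sched_use_walt_cpu_util",
	"/proc/sys/kernel/sched_use_walt_task_util",
	"/proc/sys/kernel/sched_walt_init_task_load_pct",
	"/proc/sys/kernel/sched_cpu_high_irqload",
};

/* Print one tunable, or note that it is missing (option disabled). */
static void dump(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		printf("%-50s <missing: CONFIG_SCHED_WALT off?>\n", path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%-50s %s", path, buf);
	fclose(f);
}

int main(void)
{
	unsigned int i;
	FILE *f;

	for (i = 0; i < sizeof(tunables) / sizeof(tunables[0]); i++)
		dump(tunables[i]);

	/* Hypothetical example value: raise the initial task demand to 15%. */
	f = fopen("/proc/sys/kernel/sched_walt_init_task_load_pct", "w");
	if (f) {
		fputs("15\n", f);
		fclose(f);
	}

	dump("/proc/sys/kernel/sched_walt_init_task_load_pct");
	return 0;
}

The write is routed through sysctl_sched_walt_init_task_load_pct_sysctl_handler (declared in kernel/sched/walt.h above), which is where any range checking of the percentage would take place, so a bad value should be rejected the same way a direct sysctl -w would be.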