From 474473998bc5424780b8658f6b004d01d8bd5fdd Mon Sep 17 00:00:00 2001 From: Hu Zhaodong Date: Thu, 9 Dec 2021 11:49:23 +0800 Subject: [PATCH 1/2] walt basic for openHarmony --- fs/proc/base.c | 68 ++ include/linux/sched.h | 66 ++ include/linux/sched/cpufreq.h | 2 + include/linux/sched/sysctl.h | 10 + include/trace/events/walt.h | 170 +++ init/Kconfig | 9 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 92 ++ kernel/sched/cpufreq_schedutil.c | 7 +- kernel/sched/cputime.c | 15 + kernel/sched/deadline.c | 6 + kernel/sched/debug.c | 21 + kernel/sched/fair.c | 192 +++- kernel/sched/rt.c | 6 + kernel/sched/sched.h | 333 +++++- kernel/sched/stop_task.c | 6 + kernel/sched/walt.c | 1740 ++++++++++++++++++++++++++++++ kernel/sched/walt.h | 236 ++++ kernel/sysctl.c | 34 + 21 files changed, 3007 insertions(+), 9 deletions(-) create mode 100644 include/trace/events/walt.h create mode 100644 kernel/sched/walt.c create mode 100644 kernel/sched/walt.h diff --git a/fs/proc/base.c b/fs/proc/base.c index 9b3038f1b9b5..1e12fc895101 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include #include @@ -1573,6 +1574,70 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { #endif /* CONFIG_SCHED_AUTOGROUP */ +#ifdef CONFIG_SCHED_WALT +static int sched_init_task_load_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + seq_printf(m, "%d\n", sched_get_init_task_load(p)); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_init_task_load_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int init_task_load, err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &init_task_load); + if (err) + goto out; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = sched_set_init_task_load(p, init_task_load); + + put_task_struct(p); + +out: + return err < 0 ? 
err : count; +} + +static int sched_init_task_load_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_init_task_load_show, inode); +} + +static const struct file_operations proc_pid_sched_init_task_load_operations = { + .open = sched_init_task_load_open, + .read = seq_read, + .write = sched_init_task_load_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_TIME_NS static int timens_offsets_show(struct seq_file *m, void *v) { @@ -3199,6 +3264,9 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_WALT + REG("sched_init_task_load", 00644, proc_pid_sched_init_task_load_operations), +#endif #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 53198ac3d154..194b96b59d67 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -213,6 +213,15 @@ struct io_uring_task; /* Task command name length: */ #define TASK_COMM_LEN 16 +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -495,6 +504,53 @@ struct sched_entity { #endif }; +#ifdef CONFIG_SCHED_WALT +extern void sched_exit(struct task_struct *p); +extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct); +extern u32 sched_get_init_task_load(struct task_struct *p); +extern void free_task_load_ptrs(struct task_struct *p); +#define RAVG_HIST_SIZE_MAX 5 +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. 
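+	 *
+	 * The exact choice depends on sched_window_stats_policy; with the
+	 * default WINDOW_STATS_MAX_RECENT_AVG it is max(average, most recent
+	 * sample). Illustrative numbers only: a history of {4, 6, 5, 7, 3} ms
+	 * of frequency-scaled busy time (most recent first) gives
+	 * demand = max(5ms, 4ms) = 5ms.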
+ * + * 'curr_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the current window + * + * 'prev_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the previous window + * + * 'curr_window' represents the sum of all entries in curr_window_cpu + * + * 'prev_window' represents the sum of all entries in prev_window_cpu + * + */ + u64 mark_start; + u32 sum, demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 *curr_window_cpu, *prev_window_cpu; + u32 curr_window, prev_window; + u16 active_windows; + u16 demand_scaled; +}; +#else +static inline void sched_exit(struct task_struct *p) { } +static inline void free_task_load_ptrs(struct task_struct *p) { } +#endif /* CONFIG_SCHED_WALT */ + struct sched_rt_entity { struct list_head run_list; unsigned long timeout; @@ -700,6 +756,16 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; + u64 last_sleep_ts; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 3ed5aa18593f..c7cf63236f5b 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -9,6 +9,8 @@ */ #define SCHED_CPUFREQ_IOWAIT (1U << 0) +#define SCHED_CPUFREQ_WALT (1U << 1) +#define SCHED_CPUFREQ_CONTINUE (1U << 2) #ifdef CONFIG_CPU_FREQ struct cpufreq_policy; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3c31ba88aca5..210909cd4141 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -30,6 +30,16 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_cpu_high_irqload; + +extern int +sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/include/trace/events/walt.h b/include/trace/events/walt.h new file mode 100644 index 000000000000..603889af1de2 --- /dev/null +++ b/include/trace/events/walt.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM walt + +#if !defined(_TRACE_WALT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WALT_H + +#include +#include + +struct rq; +extern const char *task_event_names[]; + +#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_WALT) +static inline void __window_data(u32 *dst, u32 *src) +{ + if (src) + memcpy(dst, src, nr_cpu_ids * sizeof(u32)); + else + memset(dst, 0, nr_cpu_ids * sizeof(u32)); +} + +struct trace_seq; +const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + seq_buf_used(&p->seq); + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%u ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} + +static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new) +{ + if (curr) + if (new) + return 
rq->nt_curr_runnable_sum; + else + return rq->curr_runnable_sum; + else + if (new) + return rq->nt_prev_runnable_sum; + else + return rq->prev_runnable_sum; +} + +#endif + +TRACE_EVENT(sched_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + enum task_event evt), + + TP_ARGS(rq, p, runtime, samples, evt), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field(unsigned int, runtime ) + __field( int, samples ) + __field(enum task_event, evt ) + __field(unsigned int, demand ) + __array( u32, hist, RAVG_HIST_SIZE_MAX) + __field( int, cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = p->ravg.demand; + memcpy(__entry->hist, p->ravg.sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %s demand %u" + " (hist: %u %u %u %u %u) cpu %d", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, + task_event_names[__entry->evt], __entry->demand, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu) +); + +TRACE_EVENT(sched_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt, + u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( pid_t, cur_pid ) + __field(unsigned int, cur_freq ) + __field( u64, wallclock ) + __field( u64, mark_start ) + __field( u64, delta_m ) + __field( u64, win_start ) + __field( u64, delta ) + __field( u64, irqtime ) + __field(enum task_event, evt ) + __field(unsigned int, demand ) + __field(unsigned int, sum ) + __field( int, cpu ) + __field( u64, rq_cs ) + __field( u64, rq_ps ) + __field( u32, curr_window ) + __field( u32, prev_window ) + __dynamic_array(u32, curr_sum, nr_cpu_ids ) + __dynamic_array(u32, prev_sum, nr_cpu_ids ) + __field( u64, nt_cs ) + __field( u64, nt_ps ) + __field( u32, active_windows ) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->delta = (wallclock - rq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cluster->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->delta_m = (wallclock - p->ravg.mark_start); + __entry->demand = p->ravg.demand; + __entry->sum = p->ravg.sum; + __entry->irqtime = irqtime; + __entry->rq_cs = rq->curr_runnable_sum; + __entry->rq_ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + __window_data(__get_dynamic_array(curr_sum), p->ravg.curr_window_cpu); + __window_data(__get_dynamic_array(prev_sum), p->ravg.prev_window_cpu); + __entry->nt_cs = rq->nt_curr_runnable_sum; + __entry->nt_ps = rq->nt_prev_runnable_sum; + __entry->active_windows = p->ravg.active_windows; + ), + + TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u", + __entry->wallclock, __entry->win_start, __entry->delta, + task_event_names[__entry->evt], __entry->cpu, 
+ __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->sum, __entry->irqtime, + __entry->rq_cs, __entry->rq_ps, __entry->curr_window, + __window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids), + __entry->prev_window, + __window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids), + __entry->nt_cs, __entry->nt_ps, + __entry->active_windows) +); + +#endif /* _TRACE_WALT_H */ + +/* This part must be outside protection */ +#include diff --git a/init/Kconfig b/init/Kconfig index fc4c9f416fad..8b20edacf921 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -526,6 +526,15 @@ config SCHED_THERMAL_PRESSURE This requires the architecture to implement arch_set_thermal_pressure() and arch_get_thermal_pressure(). +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/exit.c b/kernel/exit.c index d13d67fc5f4e..795e16ecc422 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -765,6 +765,7 @@ void __noreturn do_exit(long code) io_uring_files_cancel(tsk->files); exit_signals(tsk); /* sets PF_EXITING */ + sched_exit(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) diff --git a/kernel/fork.c b/kernel/fork.c index 39b1783a7613..7528c3f3736e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2369,6 +2369,7 @@ static __latent_entropy struct task_struct *copy_process( perf_event_free_task(p); bad_fork_cleanup_policy: lockdep_free_task(p); + free_task_load_ptrs(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5fc9c9b70862..75ab238bde9d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -27,6 +27,7 @@ obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e4551d1736fa..41072cfe0c57 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ #include "pelt.h" #include "smp.h" +#include "walt.h" /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -2007,6 +2008,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); + fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -2794,6 +2796,26 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * accesses to the task state; see try_to_wake_up() and set_current_state(). 
*/ +#ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +/* utility function to update walt signals at wakeup */ +static inline void walt_try_to_wake_up(struct task_struct *p) +{ + struct rq *rq = cpu_rq(task_cpu(p)); + struct rq_flags rf; + u64 wallclock; + + rq_lock_irqsave(rq, &rf); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + rq_unlock_irqrestore(rq, &rf); +} +#else +#define walt_try_to_wake_up(a) {} +#endif +#endif + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -2928,6 +2950,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_acquire__after_ctrl_dep(); + walt_try_to_wake_up(p); + /* * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq * == 0), which means we need to do an enqueue, change p->state to @@ -3233,6 +3257,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; + init_new_task_load(p); __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -3363,6 +3388,8 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(p); + mark_task_starting(p); + activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -3995,6 +4022,7 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; + u64 wallclock; unsigned long thermal_pressure; arch_scale_freq_tick(); @@ -4002,6 +4030,9 @@ void scheduler_tick(void) rq_lock(rq, &rf); + set_window_start(rq); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_rq_clock(rq); thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); @@ -4423,6 +4454,7 @@ static void __sched notrace __schedule(bool preempt) struct rq_flags rf; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -4505,7 +4537,13 @@ static void __sched notrace __schedule(bool preempt) clear_tsk_need_resched(prev); clear_preempt_need_resched(); + wallclock = sched_ktime_clock(); if (likely(prev != next)) { + if (!prev->on_rq) + prev->last_sleep_ts = wallclock; + + update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -4535,6 +4573,7 @@ static void __sched notrace __schedule(bool preempt) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unlock_irq(rq, &rf); } @@ -6949,6 +6988,11 @@ int sched_cpu_deactivate(unsigned int cpu) static void sched_rq_cpu_starting(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; update_max_interval(); @@ -6971,6 +7015,7 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); + if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -7000,6 +7045,8 @@ void __init sched_init_smp(void) sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); + update_cluster_topology(); + /* Move init over to 
a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); @@ -7062,6 +7109,8 @@ void __init sched_init(void) wait_bit_init(); + init_clusters(); + #ifdef CONFIG_FAIR_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -7173,6 +7222,7 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; + walt_sched_init_rq(rq); INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7203,6 +7253,7 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + init_new_task_load(current); calc_load_update = jiffies + LOAD_FREQ; @@ -8481,3 +8532,44 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); } + +#ifdef CONFIG_SCHED_WALT +/* + * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field + * + * Stop accounting (exiting) task's future cpu usage + * + * We need this so that reset_all_windows_stats() can function correctly. + * reset_all_window_stats() depends on do_each_thread/for_each_thread task + * iterators to reset *all* task's statistics. Exiting tasks however become + * invisible to those iterators. sched_exit() is called on a exiting task prior + * to being removed from task_list, which will let reset_all_window_stats() + * function correctly. + */ +void sched_exit(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + u64 wallclock; + + rq = task_rq_lock(p, &rf); + + /* rq->curr == p */ + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + dequeue_task(rq, p, 0); + /* + * task's contribution is already removed from the + * cumulative window demand in dequeue. As the + * task's stats are reset, the next enqueue does + * not change the cumulative window demand. 
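+	 *
+	 * sum_history[0] is set to EXITING_TASK_MARKER below so that the
+	 * rest of the window accounting can recognize the task as exiting
+	 * and stop tracking it (exiting_task() in walt.h is assumed to test
+	 * this marker).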
+ */ + reset_task_stats(p); + p->ravg.mark_start = wallclock; + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + + enqueue_task(rq, p, 0); + task_rq_unlock(rq, p, &rf); + free_task_load_ptrs(p); +} +#endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5e39da0ae086..5bc0dca90f7b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -287,6 +287,10 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); +#ifdef CONFIG_SCHED_WALT + return cpu_util_freq_walt(sg_cpu->cpu); +#endif + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); } @@ -520,7 +524,8 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) ignore_dl_rate_limit(sg_cpu, sg_policy); - if (sugov_should_update_freq(sg_policy, time)) { + if (sugov_should_update_freq(sg_policy, time) && + !(flags & SCHED_CPUFREQ_CONTINUE)) { next_f = sugov_next_freq_shared(sg_cpu, time); if (sg_policy->policy->fast_switch_enabled) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5a55d2300452..cf87d3fff5dd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,7 @@ * Simple CPU accounting cgroup controller */ #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -52,11 +53,18 @@ void irqtime_account_irq(struct task_struct *curr) struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; @@ -70,6 +78,13 @@ void irqtime_account_irq(struct task_struct *curr) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); +#ifdef CONFIG_SCHED_WALT + else + account = false; + + if (account) + sched_account_irqtime(cpu, curr, delta, wallclock); +#endif } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8255267ce323..2a64cced37a5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,7 @@ */ #include "sched.h" #include "pelt.h" +#include "walt.h" struct dl_bandwidth def_dl_bandwidth; @@ -1440,6 +1441,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -1454,6 +1456,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -2547,6 +2550,9 @@ const struct sched_class dl_sched_class .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; int sched_dl_global_validate(void) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 70a578272436..e5af311230be 100644 --- a/kernel/sched/debug.c +++ 
b/kernel/sched/debug.c @@ -715,6 +715,17 @@ do { \ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); PN(clock_task); +#ifdef CONFIG_SCHED_WALT + P(cluster->load_scale_factor); + P(cluster->capacity); + P(cluster->max_possible_capacity); + P(cluster->efficiency); + P(cluster->cur_freq); + P(cluster->max_freq); + P(cluster->exec_scale_factor); + SEQ_printf(m, " .%-30s: %llu\n", "walt_stats.cumulative_runnable_avg", + rq->walt_stats.cumulative_runnable_avg_scaled); +#endif #undef P #undef PN @@ -791,6 +802,12 @@ static void sched_debug_header(struct seq_file *m) PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); +#ifdef CONFIG_SCHED_WALT + P(sched_init_task_load_windows); + P(min_capacity); + P(max_capacity); + P(sched_ravg_window); +#endif #undef PN #undef P @@ -983,6 +1000,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(se.statistics.nr_wakeups_passive); P_SCHEDSTAT(se.statistics.nr_wakeups_idle); +#ifdef CONFIG_SCHED_WALT + P(ravg.demand); +#endif + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c004e3b89c32..fc8529ccb6a9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,34 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "walt.h" + +#ifdef CONFIG_SCHED_WALT +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +#endif + +#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_CFS_BANDWIDTH) +static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq); +static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, + struct task_struct *p); +static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, + struct task_struct *p); +static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +#else +static inline void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) {} +static inline void +walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} +static inline void +walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +#define walt_inc_throttled_cfs_rq_stats(...) +#define walt_dec_throttled_cfs_rq_stats(...) 
+ +#endif /* * Targeted preemption latency for CPU-bound tasks: @@ -1559,7 +1587,6 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); -static unsigned long cpu_util(int cpu); static inline long adjust_numa_imbalance(int imbalance, int nr_running); static inline enum @@ -3902,6 +3929,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return p->ravg.demand_scaled; +#endif return READ_ONCE(p->se.avg.util_avg); } @@ -3914,6 +3945,10 @@ static inline unsigned long _task_util_est(struct task_struct *p) static inline unsigned long task_util_est(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return p->ravg.demand_scaled; +#endif return max(task_util(p), _task_util_est(p)); } @@ -4826,13 +4861,16 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; + walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq); if (qcfs_rq->load.weight) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); + walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq); + } /* * Note: distribution will already see us throttled via the @@ -4849,6 +4887,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; + struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4877,6 +4916,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -4891,7 +4931,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; - + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -4907,6 +4947,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); unthrottle_throttle: /* @@ -5470,8 +5511,6 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP -static inline unsigned long cpu_util(int cpu); - static inline bool cpu_overutilized(int cpu) { return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); @@ -5539,6 +5578,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + walt_inc_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5556,6 +5596,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + walt_inc_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5571,7 +5612,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); - + 
inc_rq_walt_stats(rq, p); /* * Since new tasks are assigned an initial util_avg equal to * half of the spare capacity of their CPU, tiny tasks have the @@ -5633,6 +5674,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + walt_dec_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5662,6 +5704,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + walt_dec_cfs_rq_stats(cfs_rq, p); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5671,6 +5714,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); + dec_rq_walt_stats(rq, p); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -6382,11 +6426,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * * Return: the (estimated) utilization for the specified CPU */ -static inline unsigned long cpu_util(int cpu) +unsigned long cpu_util(int cpu) { struct cfs_rq *cfs_rq; unsigned int util; +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) { + u64 walt_cpu_util = + cpu_rq(cpu)->walt_stats.cumulative_runnable_avg_scaled; + + return min_t(unsigned long, walt_cpu_util, + capacity_orig_of(cpu)); + } +#endif + cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); @@ -6414,10 +6468,29 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) struct cfs_rq *cfs_rq; unsigned int util; +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. + */ + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util) && + p->state == TASK_WAKING) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return cpu_util(cpu); +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) { + util = max_t(long, cpu_util(cpu) - task_util(p), 0); + return min_t(unsigned long, util, capacity_orig_of(cpu)); + } +#endif + cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); @@ -6523,6 +6596,18 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) return min(util, capacity_orig_of(cpu)); } +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig; + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + + return cap_scale(max_cap, scale_freq); +} + /* * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. 
compute_energy() predicts what will be the utilization @@ -11269,6 +11354,9 @@ const struct sched_class fair_sched_class #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = walt_fixup_sched_stats_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG @@ -11321,6 +11409,96 @@ __init void init_sched_fair_class(void) } +/* WALT sched implementation begins here */ +#ifdef CONFIG_SCHED_WALT + +#ifdef CONFIG_CFS_BANDWIDTH + +static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) +{ + cfs_rq->walt_stats.cumulative_runnable_avg_scaled = 0; +} + +static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) +{ + inc_nr_big_task(&cfs_rq->walt_stats, p); + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + p->ravg.demand_scaled); +} + +static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) +{ + dec_nr_big_task(&cfs_rq->walt_stats, p); + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + -(s64)p->ravg.demand_scaled); +} + +static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *tcfs_rq) +{ + struct rq *rq = rq_of(tcfs_rq); + + fixup_cumulative_runnable_avg(stats, + tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + + if (stats == &rq->walt_stats) + walt_fixup_cum_window_demand(rq, + tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + +} + +static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *tcfs_rq) +{ + struct rq *rq = rq_of(tcfs_rq); + + fixup_cumulative_runnable_avg(stats, + -tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + + /* + * We remove the throttled cfs_rq's tasks's contribution from the + * cumulative window demand so that the same can be added + * unconditionally when the cfs_rq is unthrottled. + */ + if (stats == &rq->walt_stats) + walt_fixup_cum_window_demand(rq, + -tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); +} + +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + s64 task_load_delta = (s64)updated_demand_scaled - + p->ravg.demand_scaled; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + task_load_delta); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Fix up rq->walt_stats only if we didn't find any throttled cfs_rq */ + if (!se) { + fixup_cumulative_runnable_avg(&rq->walt_stats, + task_load_delta); + walt_fixup_cum_window_demand(rq, task_load_delta); + } +} + +#else /* CONFIG_CFS_BANDWIDTH */ +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + fixup_walt_sched_stats_common(rq, p, updated_demand_scaled); +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_SCHED_WALT */ + /* * Helper functions to facilitate extracting info from tracepoints. 
*/ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dae1e8eaa983..5938cf2e421b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -6,6 +6,7 @@ #include "sched.h" #include "pelt.h" +#include "walt.h" int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -1389,6 +1390,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1400,6 +1402,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } @@ -2480,6 +2483,9 @@ const struct sched_class rt_sched_class #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 08db8e095e48..21aa5b081b96 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -87,6 +87,48 @@ struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sched_ravg_window; +extern unsigned int walt_cpu_util_freq_divisor; + +struct walt_sched_stats { + u64 cumulative_runnable_avg_scaled; +}; + +struct load_subtractions { + u64 window_start; + u64 subs; + u64 new_subs; +}; + +#define NUM_TRACKED_WINDOWS 2 + +struct sched_cluster { + raw_spinlock_t load_lock; + struct list_head list; + struct cpumask cpus; + int id; + int max_power_cost; + int min_power_cost; + int max_possible_capacity; + int capacity; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + unsigned int exec_scale_factor; + /* + * max_freq = user maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq; + unsigned int max_possible_freq; + bool freq_init_done; +}; + +extern unsigned int sched_disable_window_stats; + +#endif /* CONFIG_SCHED_WALT */ + + /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -594,6 +636,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + struct walt_sched_stats walt_stats; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; @@ -604,6 +650,9 @@ struct cfs_rq { int throttled; int throttle_count; struct list_head throttled_list; +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1008,6 +1057,25 @@ struct rq { u64 max_idle_balance_cost; #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_WALT + struct sched_cluster *cluster; + struct cpumask freq_domain_cpumask; + struct walt_sched_stats walt_stats; + + u64 window_start; + unsigned long walt_flags; + + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 cum_window_demand_scaled; + struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -1837,6 +1905,10 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif +#ifdef 
CONFIG_SCHED_WALT + void (*fixup_walt_sched_stats)(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +#endif } __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -2052,6 +2124,15 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ +#ifdef CONFIG_SCHED_WALT +u64 sched_ktime_clock(void); +#else +static inline u64 sched_ktime_clock(void) +{ + return sched_clock(); +} +#endif + #ifndef arch_scale_freq_tick static __always_inline void arch_scale_freq_tick(void) @@ -2077,7 +2158,14 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +unsigned long capacity_curr_of(int cpu); +unsigned long cpu_util(int cpu); + #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_disabled; +#endif #ifdef CONFIG_PREEMPTION static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -2390,11 +2478,20 @@ DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; + u64 clock; +#ifdef CONFIG_SCHED_WALT + if (!(flags & SCHED_CPUFREQ_WALT)) + return; + + clock = sched_ktime_clock(); +#else + clock = rq_clock(rq); +#endif data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, cpu_of(rq))); if (data) - data->func(data, rq_clock(rq), flags); + data->func(data, clock, flags); } #else static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} @@ -2644,3 +2741,237 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#ifdef CONFIG_SCHED_WALT +static inline int cluster_first_cpu(struct sched_cluster *cluster) +{ + return cpumask_first(&cluster->cpus); +} + +extern struct list_head cluster_head; +extern struct sched_cluster *sched_cluster[NR_CPUS]; + +#define for_each_sched_cluster(cluster) \ + list_for_each_entry_rcu(cluster, &cluster_head, list) + +extern struct mutex policy_mutex; +extern unsigned int sched_disable_window_stats; +extern unsigned int max_possible_freq; +extern unsigned int min_max_freq; +extern unsigned int max_possible_efficiency; +extern unsigned int min_possible_efficiency; +extern unsigned int max_capacity; +extern unsigned int min_capacity; +extern unsigned int max_load_scale_factor; +extern unsigned int max_possible_capacity; +extern unsigned int min_max_possible_capacity; +extern unsigned int max_power_cost; +extern unsigned int __read_mostly sched_init_task_load_windows; +extern unsigned int sysctl_sched_restrict_cluster_spill; +extern unsigned int sched_pred_alert_load; +extern struct sched_cluster init_cluster; + +static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta) +{ + rq->cum_window_demand_scaled += scaled_delta; + if (unlikely((s64)rq->cum_window_demand_scaled < 0)) + rq->cum_window_demand_scaled = 0; +} + +/* Is frequency of two cpus synchronized with each other? 
*/ +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + struct rq *rq = cpu_rq(src_cpu); + + if (src_cpu == dst_cpu) + return 1; + + return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); +} + +extern void reset_task_stats(struct task_struct *p); + +#define CPU_RESERVED 1 +static inline int is_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline int mark_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_and_set_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline void clear_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline int cpu_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->capacity; +} + +static inline int cpu_max_possible_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_capacity; +} + +static inline int cpu_load_scale_factor(int cpu) +{ + return cpu_rq(cpu)->cluster->load_scale_factor; +} + +static inline unsigned int cluster_max_freq(struct sched_cluster *cluster) +{ + /* + * Governor and thermal driver don't know the other party's mitigation + * voting. So struct cluster saves both and return min() for current + * cluster fmax. + */ + return cluster->max_freq; +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static inline void __update_min_max_capacity(void) +{ + int i; + int max_cap = 0, min_cap = INT_MAX; + + for_each_possible_cpu(i) { + if (!cpu_active(i)) + continue; + + max_cap = max(max_cap, cpu_capacity(i)); + min_cap = min(min_cap, cpu_capacity(i)); + } + + max_capacity = max_cap; + min_capacity = min_cap; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static inline unsigned long +load_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cluster->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, + cluster_max_freq(cluster)); +} + +static inline int compute_load_scale_factor(struct sched_cluster *cluster) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. + */ + load_scale *= load_scale_cpu_efficiency(cluster); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cluster); + load_scale >>= 10; + + return load_scale; +} + +static inline bool is_max_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == max_possible_capacity; +} + +static inline bool is_min_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == min_max_possible_capacity; +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long +capacity_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return (1024 * cluster->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. 
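+ *
+ * For example, a cluster whose max frequency is twice min_max_freq yields
+ * 2048 here; compute_capacity() combines this with the efficiency factor,
+ * shifting the product back down by 10 bits after each multiplication.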
+ */ +static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) +{ + return (1024 * cluster_max_freq(cluster)) / min_max_freq; +} + +static inline int compute_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cluster); + capacity >>= 10; + + return capacity; +} + +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return cpu_max_possible_capacity(cpu); +} + +static inline unsigned long cpu_util_freq_walt(int cpu) +{ + u64 util; + struct rq *rq = cpu_rq(cpu); + unsigned long capacity = capacity_orig_of(cpu); + + if (unlikely(walt_disabled || !sysctl_sched_use_walt_cpu_util)) + return cpu_util(cpu); + + util = rq->prev_runnable_sum << SCHED_CAPACITY_SHIFT; + util = div_u64(util, sched_ravg_window); + + return (util >= capacity) ? capacity : util; +} +#else /* CONFIG_SCHED_WALT */ +static inline void walt_fixup_cum_window_demand(struct rq *rq, + s64 scaled_delta) { } + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} + +static inline int is_reserved(int cpu) +{ + return 0; +} + +static inline void clear_reserved(int cpu) { } + +#endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..ae43901c57af 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "walt.h" #ifdef CONFIG_SMP static int @@ -47,12 +48,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) @@ -133,4 +136,7 @@ const struct sched_class stop_sched_class .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 000000000000..7ad74b583358 --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1740 @@ +/* + * walt.c + * + * Window Assistant Load Tracking + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include "sched.h" +#include "walt.h" +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS + +const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", + "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", + "IRQ_UPDATE"}; + +#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 +#define SCHED_ACCOUNT_WAIT_TIME 1 + +static ktime_t ktime_last; +static bool sched_ktime_suspended; +DEFINE_MUTEX(cluster_lock); +static atomic64_t walt_irq_work_lastq_ws; +u64 walt_load_reported_window; + +static struct irq_work walt_cpufreq_irq_work; +static struct irq_work walt_migration_irq_work; + +u64 sched_ktime_clock(void) +{ + if (unlikely(sched_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void sched_resume(void) +{ + sched_ktime_suspended = false; +} + +static int sched_suspend(void) +{ + ktime_last = ktime_get(); + sched_ktime_suspended = true; + return 0; +} + +static struct syscore_ops sched_syscore_ops = { + .resume = sched_resume, + .suspend = sched_suspend +}; + +static int __init sched_init_ops(void) +{ + register_syscore_ops(&sched_syscore_ops); + return 0; +} +late_initcall(sched_init_ops); + +static void acquire_rq_locks_irqsave(const cpumask_t *cpus, + unsigned long *flags) +{ + int cpu; + int level = 0; + + local_irq_save(*flags); + for_each_cpu(cpu, cpus) { + if (level == 0) + raw_spin_lock(&cpu_rq(cpu)->lock); + else + raw_spin_lock_nested(&cpu_rq(cpu)->lock, level); + level++; + } +} + +static void release_rq_locks_irqrestore(const cpumask_t *cpus, + unsigned long *flags) +{ + int cpu; + + for_each_cpu(cpu, cpus) + raw_spin_unlock(&cpu_rq(cpu)->lock); + local_irq_restore(*flags); +} + +#ifdef CONFIG_HZ_300 +/* + * Tick interval becomes to 3333333 due to + * rounding error when HZ=300. + */ +#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) +#else +/* Min window size (in ns) = 20ms */ +#define MIN_SCHED_RAVG_WINDOW 20000000 +#endif + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly walt_disabled = 0; + +__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +/* + * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy + * associated with them. This is required for atomic update of those variables + * when being modifed via sysctl interface. + * + * IMPORTANT: Initialize both copies to same value!! + */ + +__read_mostly unsigned int sched_ravg_hist_size = 5; +__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; + +__read_mostly unsigned int sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG; +__read_mostly unsigned int sysctl_sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG; + +static __read_mostly unsigned int sched_io_is_busy = 1; + +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +unsigned int sysctl_sched_walt_init_task_load_pct = 15; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); + +/* Window size (in ns) */ +__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; + +/* + * A after-boot constant divisor for cpu_util_freq_walt() to apply the load + * boost. + */ +__read_mostly unsigned int walt_cpu_util_freq_divisor; + +/* Initial task load. Newly created tasks are assigned this load. 
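+ * The percentage comes from sysctl_sched_init_task_load_pct (15 by default)
+ * and can be overridden for a task's children via
+ * /proc/<pid>/sched_init_task_load.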
*/ +unsigned int __read_mostly sched_init_task_load_windows; +unsigned int __read_mostly sched_init_task_load_windows_scaled; +unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + */ +unsigned int min_max_freq = 1; + +unsigned int max_capacity = 1024; /* max(rq->capacity) */ +unsigned int min_capacity = 1024; /* min(rq->capacity) */ +unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ +unsigned int +min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ + +/* Temporarily disable window-stats activity on all cpus */ +unsigned int __read_mostly sched_disable_window_stats; + +/* + * This governs what load needs to be used when reporting CPU busy time + * to the cpufreq governor. + */ +__read_mostly unsigned int sysctl_sched_freq_reporting_policy; + +static int __init set_sched_ravg_window(char *str) +{ + unsigned int window_size; + + get_option(&str, &window_size); + + if (window_size < MIN_SCHED_RAVG_WINDOW || + window_size > MAX_SCHED_RAVG_WINDOW) { + WARN_ON(1); + return -EINVAL; + } + + sched_ravg_window = window_size; + return 0; +} +early_param("sched_ravg_window", set_sched_ravg_window); + +__read_mostly unsigned int walt_scale_demand_divisor; +#define scale_demand(d) ((d)/walt_scale_demand_divisor) + +void inc_rq_walt_stats(struct rq *rq, struct task_struct *p) +{ + walt_inc_cumulative_runnable_avg(rq, p); +} + +void dec_rq_walt_stats(struct rq *rq, struct task_struct *p) +{ + walt_dec_cumulative_runnable_avg(rq, p); +} + +void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + s64 task_load_delta = (s64)updated_demand_scaled - + p->ravg.demand_scaled; + + fixup_cumulative_runnable_avg(&rq->walt_stats, task_load_delta); + + walt_fixup_cum_window_demand(rq, task_load_delta); +} + +static u64 +update_window_start(struct rq *rq, u64 wallclock, int event) +{ + s64 delta; + int nr_windows; + u64 old_window_start = rq->window_start; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < sched_ravg_window) + return old_window_start; + + nr_windows = div64_u64(delta, sched_ravg_window); + rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; + + rq->cum_window_demand_scaled = + rq->walt_stats.cumulative_runnable_avg_scaled; + + return old_window_start; +} + +void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. 
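+			 * The factors below cancel out: multiplying by
+			 * 3 * nr_windows and dividing by 4 * nr_windows is one
+			 * 3/4 decay step; once 10 or more windows have passed
+			 * the old average is dropped (set to 0) instead.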
*/ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +static int +account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event) +{ + /* + * No need to bother updating task demand for exiting tasks + * or the idle task. + */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* + * When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. + */ + if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + /* + * The idle exit time is not accounted for the first task _picked_ up to + * run on the idle CPU. + */ + if (event == PICK_NEXT_TASK && rq->curr == rq->idle) + return 0; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0; + } + + return 1; +} + +/* + * In this function we match the accumulated subtractions with the current + * and previous windows we are operating with. Ignore any entries where + * the window start in the load_subtraction struct does not match either + * the curent or the previous window. This could happen whenever CPUs + * become idle or busy with interrupts disabled for an extended period. + */ +static inline void account_load_subtractions(struct rq *rq) +{ + u64 ws = rq->window_start; + u64 prev_ws = ws - sched_ravg_window; + struct load_subtractions *ls = rq->load_subs; + int i; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + if (ls[i].window_start == ws) { + rq->curr_runnable_sum -= ls[i].subs; + rq->nt_curr_runnable_sum -= ls[i].new_subs; + } else if (ls[i].window_start == prev_ws) { + rq->prev_runnable_sum -= ls[i].subs; + rq->nt_prev_runnable_sum -= ls[i].new_subs; + } + + ls[i].subs = 0; + ls[i].new_subs = 0; + } + + BUG_ON((s64)rq->prev_runnable_sum < 0); + BUG_ON((s64)rq->curr_runnable_sum < 0); + BUG_ON((s64)rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)rq->nt_curr_runnable_sum < 0); +} + +static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) +{ + rq->load_subs[index].window_start = ws; + rq->load_subs[index].subs = 0; + rq->load_subs[index].new_subs = 0; +} + +static bool get_subtraction_index(struct rq *rq, u64 ws) +{ + int i; + u64 oldest = ULLONG_MAX; + int oldest_index = 0; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + u64 entry_ws = rq->load_subs[i].window_start; + + if (ws == entry_ws) + return i; + + if (entry_ws < oldest) { + oldest = entry_ws; + oldest_index = i; + } + } + + create_subtraction_entry(rq, ws, oldest_index); + return oldest_index; +} + +static void update_rq_load_subtractions(int index, struct rq *rq, + u32 sub_load, bool new_task) +{ + rq->load_subs[index].subs += sub_load; + if (new_task) + rq->load_subs[index].new_subs += sub_load; +} + +void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task) +{ + struct sched_cluster *cluster = cpu_cluster(cpu); + struct cpumask cluster_cpus = cluster->cpus; + u64 prev_ws = ws - sched_ravg_window; + int i; + + 
cpumask_clear_cpu(cpu, &cluster_cpus); + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(i, &cluster_cpus) { + struct rq *rq = cpu_rq(i); + int index; + + if (p->ravg.curr_window_cpu[i]) { + index = get_subtraction_index(rq, ws); + update_rq_load_subtractions(index, rq, + p->ravg.curr_window_cpu[i], new_task); + p->ravg.curr_window_cpu[i] = 0; + } + + if (p->ravg.prev_window_cpu[i]) { + index = get_subtraction_index(rq, prev_ws); + update_rq_load_subtractions(index, rq, + p->ravg.prev_window_cpu[i], new_task); + p->ravg.prev_window_cpu[i] = 0; + } + } + + raw_spin_unlock(&cluster->load_lock); +} + +static inline void inter_cluster_migration_fixup + (struct task_struct *p, int new_cpu, int task_cpu, bool new_task) +{ + struct rq *dest_rq = cpu_rq(new_cpu); + struct rq *src_rq = cpu_rq(task_cpu); + + if (same_freq_domain(new_cpu, task_cpu)) + return; + + p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window; + + dest_rq->curr_runnable_sum += p->ravg.curr_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + + src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu]; + src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu]; + + if (new_task) { + dest_rq->nt_curr_runnable_sum += p->ravg.curr_window; + dest_rq->nt_prev_runnable_sum += p->ravg.prev_window; + + src_rq->nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[task_cpu]; + src_rq->nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[task_cpu]; + } + + p->ravg.curr_window_cpu[task_cpu] = 0; + p->ravg.prev_window_cpu[task_cpu] = 0; + + update_cluster_load_subtractions(p, task_cpu, + src_rq->window_start, new_task); + + BUG_ON((s64)src_rq->prev_runnable_sum < 0); + BUG_ON((s64)src_rq->curr_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); +} + +void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + bool new_task; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + if (sched_disable_window_stats) + goto done; + + wallclock = sched_ktime_clock(); + + update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + /* + * When a task is migrating during the wakeup, adjust + * the task's contribution towards cumulative window + * demand. 
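A compact userspace model of the core bookkeeping in inter_cluster_migration_fixup(): the migrating task's current- and previous-window contributions are added to the destination CPU's runnable sums and removed from the source CPU's. The per-CPU window arrays and the new-task sums are omitted, and the structure and helper names are illustrative rather than the patch's.

#include <stdint.h>

struct cpu_sums {
	int64_t curr_runnable_sum;
	int64_t prev_runnable_sum;
};

struct task_windows {
	uint64_t curr_window;	/* contribution in the current window */
	uint64_t prev_window;	/* contribution in the previous window */
};

/* Move a migrating task's window contributions from src to dst. */
static void migrate_contribution(struct cpu_sums *src, struct cpu_sums *dst,
				 const struct task_windows *t)
{
	dst->curr_runnable_sum += t->curr_window;
	dst->prev_runnable_sum += t->prev_window;

	src->curr_runnable_sum -= t->curr_window;
	src->prev_runnable_sum -= t->prev_window;
}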
+ */ + if (p->state == TASK_WAKING && p->last_sleep_ts >= + src_rq->window_start) { + walt_fixup_cum_window_demand(src_rq, + -(s64)p->ravg.demand_scaled); + walt_fixup_cum_window_demand(dest_rq, p->ravg.demand_scaled); + } + + new_task = is_new_task(p); + + inter_cluster_migration_fixup(p, new_cpu, + task_cpu(p), new_task); + + if (!same_freq_domain(new_cpu, task_cpu(p))) + irq_work_queue(&walt_migration_irq_work); + +done: + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +void set_window_start(struct rq *rq) +{ + static int sync_cpu_available; + + if (likely(rq->window_start)) + return; + + if (!sync_cpu_available) { + rq->window_start = 1; + sync_cpu_available = 1; + atomic64_set(&walt_irq_work_lastq_ws, rq->window_start); + walt_load_reported_window = + atomic64_read(&walt_irq_work_lastq_ws); + + } else { + struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask)); + + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = sync_rq->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + u16 demand_scaled; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + demand_scaled = scale_demand(demand); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements walt stats + * avoid decrementing it here again. + * + * When window is rolled over, the cumulative window demand + * is reset to the cumulative runnable average (contribution from + * the tasks on the runqueue). If the current task is dequeued + * already, it's demand is not included in the cumulative runnable + * average. So add the task demand separately to cumulative window + * demand. 
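A userspace model of the demand-policy selection in update_history(), assuming the latest window's runtime has already been pushed into the history array as the patch does. The WINDOW_STATS_* values mirror the patch; pick_demand() itself is an illustrative name.

#include <stdint.h>

#define WINDOW_STATS_RECENT		0
#define WINDOW_STATS_MAX		1
#define WINDOW_STATS_MAX_RECENT_AVG	2
#define WINDOW_STATS_AVG		3

/* Choose task demand from the history and the latest window's runtime. */
static uint32_t pick_demand(const uint32_t *hist, int hist_size,
			    uint32_t runtime, int policy)
{
	uint64_t sum = 0;
	uint32_t max = 0, avg;
	int i;

	for (i = 0; i < hist_size; i++) {
		sum += hist[i];
		if (hist[i] > max)
			max = hist[i];
	}

	if (policy == WINDOW_STATS_RECENT)
		return runtime;
	if (policy == WINDOW_STATS_MAX)
		return max;

	avg = (uint32_t)(sum / hist_size);
	if (policy == WINDOW_STATS_AVG)
		return avg;

	/* WINDOW_STATS_MAX_RECENT_AVG: larger of the average and latest sample */
	return avg > runtime ? avg : runtime;
}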
+ */ + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) { + if (task_on_rq_queued(p) + && p->sched_class->fixup_walt_sched_stats) + p->sched_class->fixup_walt_sched_stats(rq, p, + demand_scaled); + else if (rq->curr == p) + walt_fixup_cum_window_demand(rq, demand_scaled); + } + + p->ravg.demand = demand; + p->ravg.demand_scaled = demand_scaled; + +done: + trace_sched_update_history(rq, p, runtime, samples, event); +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned long capcurr = capacity_curr_of(cpu_of(rq)); + + delta = (delta * capcurr) >> SCHED_CAPACITY_SHIFT; + + return delta; +} + +static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > sched_ravg_window)) + p->ravg.sum = sched_ravg_window; + + return delta; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static u64 update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + u64 runtime; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(rq, p, event)) { + if (new_window) + /* + * If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. + */ + update_history(rq, p, p->ravg.sum, 1, event); + return 0; + } + + if (!new_window) { + /* + * The simple case - busy time contained within the existing + * window. + */ + return add_to_task_demand(rq, p, wallclock - mark_start); + } + + /* + * Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. 
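scale_exec_time() above converts raw runtime into capacity-scaled runtime, so time spent on a slower or downclocked CPU contributes proportionally less demand. A standalone model with the kernel's SCHED_CAPACITY_SHIFT of 10; the helper name and the sample numbers are chosen for the sketch:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

/* Scale a raw execution delta by the CPU's current capacity (1024 == max). */
static uint64_t scale_exec_time_model(uint64_t delta, unsigned long cap_curr)
{
	return (delta * cap_curr) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	/* 10 ms of runtime on a CPU currently at half capacity (512/1024). */
	printf("%llu ns\n",
	       (unsigned long long)scale_exec_time_model(10000000ULL, 512));
	/* prints 5000000: the task accrues only 5 ms of demand */
	return 0;
}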
+ */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + runtime = add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) { + u64 scaled_window = scale_exec_time(window_size, rq); + + update_history(rq, p, scaled_window, nr_full_windows, event); + runtime += nr_full_windows * scaled_window; + } + + /* + * Roll window_start back to current to process any remainder + * in current window. + */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + runtime += add_to_task_demand(rq, p, wallclock - mark_start); + + return runtime; +} + +static u32 empty_windows[NR_CPUS]; + +static void rollover_task_window(struct task_struct *p, bool full_window) +{ + u32 *curr_cpu_windows = empty_windows; + u32 curr_window; + int i; + + /* Rollover the sum */ + curr_window = 0; + + if (!full_window) { + curr_window = p->ravg.curr_window; + curr_cpu_windows = p->ravg.curr_window_cpu; + } + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + + /* Roll over individual CPU contributions */ + for (i = 0; i < nr_cpu_ids; i++) { + p->ravg.prev_window_cpu[i] = curr_cpu_windows[i]; + p->ravg.curr_window_cpu[i] = 0; + } +} + +static void rollover_cpu_window(struct rq *rq, bool full_window) +{ + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + + if (unlikely(full_window)) { + curr_sum = 0; + nt_curr_sum = 0; + } + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; +} + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? 
SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ + return SCHED_FREQ_ACCOUNT_WAIT_TIME; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = sched_ravg_window; + u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + bool new_task; + int cpu = rq->cpu; + + new_window = mark_start < window_start; + if (new_window) { + full_window = (window_start - mark_start) >= window_size; + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + new_task = is_new_task(p); + + /* + * Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. + */ + if (!is_idle_task(p) && !exiting_task(p)) { + if (new_window) + rollover_task_window(p, full_window); + } + + if (p_is_curr_task && new_window) { + rollover_cpu_window(rq, full_window); + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; + + if (!new_window) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. + */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window += delta; + p->ravg.curr_window_cpu[cpu] += delta; + } + + goto done; + } + + if (!p_is_curr_task) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. 
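A userspace sketch of how update_cpu_busy_time() splits one busy span that crosses a window boundary: the part before window_start, capped at one full window, is credited to the previous window and the remainder to the current window. Capacity scaling is left out, the names are illustrative, and the sketch assumes mark_start < window_start <= wallclock.

#include <stdint.h>

struct busy_split {
	uint64_t prev;	/* busy time credited to the previous window */
	uint64_t curr;	/* busy time credited to the current window */
};

/* Split the span [mark_start, wallclock) around window_start. */
static struct busy_split split_busy_span(uint64_t mark_start, uint64_t wallclock,
					 uint64_t window_start,
					 uint64_t window_size)
{
	struct busy_split s;
	int full_window = (window_start - mark_start) >= window_size;

	/* At most one full window's worth goes to the previous window. */
	s.prev = full_window ? window_size : window_start - mark_start;
	s.curr = wallclock - window_start;

	return s;
}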
*/ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + /* + * Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. + */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (irqtime) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. + */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* + * Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. + */ + if (mark_start > window_start) { + *curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* + * The IRQ busy time spanned multiple windows. Process the + * window then that is all that need be accounted. + */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + *prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + +done: + return; +} + +static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq) +{ + u64 result; + + if (old_window_start == rq->window_start) + return; + + result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start, + rq->window_start); + if (result == old_window_start) + irq_work_queue(&walt_cpufreq_irq_work); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 old_window_start; + + if (!rq->window_start || sched_disable_window_stats || + p->ravg.mark_start == wallclock) + return; + + lockdep_assert_held(&rq->lock); + + old_window_start = update_window_start(rq, wallclock, event); + + if (!p->ravg.mark_start) { + goto done; + } + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime); + + if (exiting_task(p)) + goto done; + +done: + p->ravg.mark_start = wallclock; + + run_walt_irq_work(old_window_start, rq); +} + +int sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec(table, write, buffer, length, ppos); + if (rc) + return rc; + + sysctl_sched_init_task_load_pct = sysctl_sched_walt_init_task_load_pct; + + return 0; +} + +u32 sched_get_init_task_load(struct task_struct *p) +{ + return p->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + p->init_load_pct = init_load_pct; + + return 0; +} + +void init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_windows_scaled = sched_init_task_load_windows_scaled; + u32 init_load_pct = current->init_load_pct; + + p->last_sleep_ts = 0; + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), + GFP_KERNEL | __GFP_NOFAIL); + p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), + GFP_KERNEL | __GFP_NOFAIL); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + init_load_windows_scaled = scale_demand(init_load_windows); + } + + p->ravg.demand = init_load_windows; + p->ravg.demand_scaled = init_load_windows_scaled; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} + +void free_task_load_ptrs(struct task_struct *p) +{ + kfree(p->ravg.curr_window_cpu); + kfree(p->ravg.prev_window_cpu); + + /* + * update_task_ravg() can be called for exiting tasks. While the + * function itself ensures correct behavior, the corresponding + * trace event requires that these pointers be NULL. 
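The percentage-to-demand conversion in init_new_task_load() can be checked with a small userspace calculation; the 20 ms window and the divisor (window >> SCHED_CAPACITY_SHIFT, as set up in walt_init_once()) are assumptions for the example.

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

int main(void)
{
	uint64_t window = 20000000ULL;	/* assumed 20 ms ravg window */
	unsigned int init_pct = 15;	/* default sysctl_sched_init_task_load_pct */
	uint64_t divisor = window >> SCHED_CAPACITY_SHIFT;

	uint64_t demand = (uint64_t)init_pct * window / 100;	/* in ns */
	uint64_t demand_scaled = demand / divisor;

	printf("demand=%llu ns, demand_scaled=%llu\n",
	       (unsigned long long)demand,
	       (unsigned long long)demand_scaled);
	/* 15% of a 20 ms window: 3000000 ns, which scales to about 153 of 1024 */
	return 0;
}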
+ */ + p->ravg.curr_window_cpu = NULL; + p->ravg.prev_window_cpu = NULL; +} + +void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + u32 *curr_window_ptr = NULL; + u32 *prev_window_ptr = NULL; + + if (exiting_task(p)) { + sum = EXITING_TASK_MARKER; + } else { + curr_window_ptr = p->ravg.curr_window_cpu; + prev_window_ptr = p->ravg.prev_window_cpu; + memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + } + + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = curr_window_ptr; + p->ravg.prev_window_cpu = prev_window_ptr; + + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start || sched_disable_window_stats) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +unsigned int max_possible_efficiency = 1; +unsigned int min_possible_efficiency = UINT_MAX; +unsigned int max_power_cost = 1; + +static cpumask_t all_cluster_cpus = CPU_MASK_NONE; +DECLARE_BITMAP(all_cluster_ids, NR_CPUS); +struct sched_cluster *sched_cluster[NR_CPUS]; +int num_clusters; + +struct list_head cluster_head; + +static void +insert_cluster(struct sched_cluster *cluster, struct list_head *head) +{ + struct sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (cluster->max_power_cost < tmp->max_power_cost) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); + if (!cluster) { + pr_warn("Cluster allocation failed. 
Possible bad scheduling\n"); + return NULL; + } + + INIT_LIST_HEAD(&cluster->list); + cluster->max_power_cost = 1; + cluster->min_power_cost = 1; + cluster->capacity = 1024; + cluster->max_possible_capacity = 1024; + cluster->efficiency = 1; + cluster->load_scale_factor = 1024; + cluster->cur_freq = 1; + cluster->max_freq = 1; + cluster->min_freq = 1; + cluster->max_possible_freq = 1; + cluster->freq_init_done = false; + + raw_spin_lock_init(&cluster->load_lock); + cluster->cpus = *cpus; + cluster->efficiency = topology_get_cpu_scale(cpumask_first(cpus)); + + if (cluster->efficiency > max_possible_efficiency) + max_possible_efficiency = cluster->efficiency; + if (cluster->efficiency < min_possible_efficiency) + min_possible_efficiency = cluster->efficiency; + + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + + if (!cluster) + return; + + for_each_cpu(i, cpus) + cpu_rq(i)->cluster = cluster; + + insert_cluster(cluster, head); + set_bit(num_clusters, all_cluster_ids); + num_clusters++; +} + +static int compute_max_possible_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; + capacity >>= 10; + + return capacity; +} + +void walt_update_min_max_capacity(void) +{ + unsigned long flags; + + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + __update_min_max_capacity(); + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +static int +compare_clusters(void *priv, const struct list_head *a, const struct list_head *b) +{ + struct sched_cluster *cluster1, *cluster2; + int ret; + + cluster1 = container_of(a, struct sched_cluster, list); + cluster2 = container_of(b, struct sched_cluster, list); + + /* + * Don't assume higher capacity means higher power. If the + * power cost is same, sort the higher capacity cluster before + * the lower capacity cluster to start placing the tasks + * on the higher capacity cluster. + */ + ret = cluster1->max_power_cost > cluster2->max_power_cost || + (cluster1->max_power_cost == cluster2->max_power_cost && + cluster1->max_possible_capacity < + cluster2->max_possible_capacity); + + return ret; +} + +void sort_clusters(void) +{ + struct sched_cluster *cluster; + struct list_head new_head; + unsigned int tmp_max = 1; + + INIT_LIST_HEAD(&new_head); + + for_each_sched_cluster(cluster) { + cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), + max_task_load()); + cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), + 0); + + if (cluster->max_power_cost > tmp_max) + tmp_max = cluster->max_power_cost; + } + max_power_cost = tmp_max; + + move_list(&new_head, &cluster_head, true); + + list_sort(NULL, &new_head, compare_clusters); + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
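A userspace model of the arithmetic in compute_max_possible_capacity(): capacity starts at 1024, is scaled by the cluster's efficiency relative to the most efficient cluster, then by its maximum frequency relative to min_max_freq. This assumes capacity_scale_cpu_efficiency() evaluates to (1024 * efficiency) / max_possible_efficiency, which is not shown in this hunk, so treat the sketch as approximate.

#include <stdio.h>

static int max_possible_capacity_model(unsigned int efficiency,
				       unsigned int max_possible_efficiency,
				       unsigned int max_possible_freq,
				       unsigned int min_max_freq)
{
	int capacity = 1024;

	/* Scale by efficiency relative to the most efficient cluster. */
	capacity *= (1024 * efficiency) / max_possible_efficiency;
	capacity >>= 10;

	/* Scale by max frequency relative to the slowest cluster's max. */
	capacity *= (1024 * max_possible_freq) / min_max_freq;
	capacity >>= 10;

	return capacity;
}

int main(void)
{
	/* Equal efficiency, but twice the max frequency of the slowest cluster. */
	printf("%d\n", max_possible_capacity_model(1024, 1024, 2000000, 1000000));
	/* prints 2048 */
	return 0;
}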
+ */ + move_list(&cluster_head, &new_head, false); +} + +static void update_all_clusters_stats(void) +{ + struct sched_cluster *cluster; + u64 highest_mpc = 0, lowest_mpc = U64_MAX; + unsigned long flags; + + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + + for_each_sched_cluster(cluster) { + u64 mpc; + + cluster->capacity = compute_capacity(cluster); + mpc = cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + cluster->exec_scale_factor = + DIV_ROUND_UP(cluster->efficiency * 1024, + max_possible_efficiency); + + if (mpc > highest_mpc) + highest_mpc = mpc; + + if (mpc < lowest_mpc) + lowest_mpc = mpc; + } + + max_possible_capacity = highest_mpc; + min_max_possible_capacity = lowest_mpc; + + __update_min_max_capacity(); + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +void update_cluster_topology(void) +{ + struct cpumask cpus = *cpu_possible_mask; + const struct cpumask *cluster_cpus; + struct list_head new_head; + int i; + + INIT_LIST_HEAD(&new_head); + + for_each_cpu(i, &cpus) { + cluster_cpus = cpu_coregroup_mask(i); + cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); + cpumask_andnot(&cpus, &cpus, cluster_cpus); + add_cluster(cluster_cpus, &new_head); + } + + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. + */ + move_list(&cluster_head, &new_head, false); + update_all_clusters_stats(); +} + +struct sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .max_power_cost = 1, + .min_power_cost = 1, + .capacity = 1024, + .max_possible_capacity = 1024, + .efficiency = 1, + .load_scale_factor = 1024, + .cur_freq = 1, + .max_freq = 1, + .min_freq = 1, + .max_possible_freq = 1, + .exec_scale_factor = 1024, +}; + +void init_clusters(void) +{ + bitmap_clear(all_cluster_ids, 0, NR_CPUS); + init_cluster.cpus = *cpu_possible_mask; + raw_spin_lock_init(&init_cluster.load_lock); + INIT_LIST_HEAD(&cluster_head); +} + +static unsigned long cpu_max_table_freq[NR_CPUS]; + +void update_cpu_cluster_capacity(const cpumask_t *cpus) +{ + int i; + struct sched_cluster *cluster; + struct cpumask cpumask; + unsigned long flags; + + cpumask_copy(&cpumask, cpus); + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + cluster->capacity = compute_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + } + + __update_min_max_capacity(); + + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + struct sched_cluster *cluster = NULL; + struct cpumask policy_cluster = *policy->related_cpus; + unsigned int orig_max_freq = 0; + int i, j, update_capacity = 0; + + if (val != CPUFREQ_CREATE_POLICY) + return 0; + + walt_update_min_max_capacity(); + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + for_each_cpu(i, &policy_cluster) + cpu_max_table_freq[i] = policy->cpuinfo.max_freq; + + for_each_cpu(i, &policy_cluster) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&policy_cluster, 
&policy_cluster, + &cluster->cpus); + + orig_max_freq = cluster->max_freq; + cluster->min_freq = policy->min; + cluster->max_freq = policy->max; + cluster->cur_freq = policy->cur; + + if (!cluster->freq_init_done) { + mutex_lock(&cluster_lock); + for_each_cpu(j, &cluster->cpus) + cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, + policy->related_cpus); + cluster->max_possible_freq = policy->cpuinfo.max_freq; + cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->freq_init_done = true; + + sort_clusters(); + update_all_clusters_stats(); + mutex_unlock(&cluster_lock); + continue; + } + + update_capacity += (orig_max_freq != cluster->max_freq); + } + + if (update_capacity) + update_cpu_cluster_capacity(policy->related_cpus); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->policy->cpu, new_freq = freq->new; + unsigned long flags; + struct sched_cluster *cluster; + struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; + int i, j; + + if (val != CPUFREQ_POSTCHANGE) + return NOTIFY_DONE; + + if (cpu_cur_freq(cpu) == new_freq) + return NOTIFY_OK; + + for_each_cpu(i, &policy_cpus) { + cluster = cpu_rq(i)->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->lock, flags); + update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return NOTIFY_OK; +} + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_walt_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return ret; +} +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_walt_callback); + +/* + * Runs in hard-irq context. This should ideally run just after the latest + * window roll-over. + */ +void walt_irq_work(struct irq_work *irq_work) +{ + struct sched_cluster *cluster; + struct rq *rq; + int cpu; + u64 wc; + bool is_migration = false; + int level = 0; + + /* Am I the window rollover work or the migration work? 
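One plausible reading of why cpufreq_notifier_trans() runs update_task_ravg() on every runqueue of the cluster before recording the new frequency: busy time accrued so far should be closed out under the scaling that was in effect while it ran. A stripped-down userspace model of that idea; the field and helper names are invented for the sketch, and the patch itself scales via capacity_curr_of() rather than the stored frequency.

#include <stdint.h>

struct cpu_acct {
	uint64_t mark_start;	/* when the current busy span began */
	uint64_t scaled_sum;	/* capacity-scaled busy time so far */
	unsigned int cap_curr;	/* current capacity, 1024 == max */
};

/* Close out the running span at the old capacity, then switch capacity. */
static void on_freq_change(struct cpu_acct *a, uint64_t now, unsigned int new_cap)
{
	a->scaled_sum += ((now - a->mark_start) * a->cap_curr) >> 10;
	a->mark_start = now;
	a->cap_curr = new_cap;	/* later deltas are scaled with the new value */
}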
*/ + if (irq_work == &walt_migration_irq_work) + is_migration = true; + + for_each_cpu(cpu, cpu_possible_mask) { + if (level == 0) + raw_spin_lock(&cpu_rq(cpu)->lock); + else + raw_spin_lock_nested(&cpu_rq(cpu)->lock, level); + level++; + } + + wc = sched_ktime_clock(); + walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws); + for_each_sched_cluster(cluster) { + raw_spin_lock(&cluster->load_lock); //TODO: rtg + + for_each_cpu(cpu, &cluster->cpus) { + rq = cpu_rq(cpu); + if (rq->curr) { + update_task_ravg(rq->curr, rq, + TASK_UPDATE, wc, 0); + account_load_subtractions(rq); + } + } + + raw_spin_unlock(&cluster->load_lock); + } + + for_each_sched_cluster(cluster) { + cpumask_t cluster_online_cpus; + unsigned int num_cpus, i = 1; + + cpumask_and(&cluster_online_cpus, &cluster->cpus, + cpu_online_mask); + num_cpus = cpumask_weight(&cluster_online_cpus); + for_each_cpu(cpu, &cluster_online_cpus) { + int flag = SCHED_CPUFREQ_WALT; + + rq = cpu_rq(cpu); + + if (i == num_cpus) + cpufreq_update_util(cpu_rq(cpu), flag); + else + cpufreq_update_util(cpu_rq(cpu), flag | + SCHED_CPUFREQ_CONTINUE); + i++; + } + } + + for_each_cpu(cpu, cpu_possible_mask) + raw_spin_unlock(&cpu_rq(cpu)->lock); + + //if (!is_migration) + //core_ctl_check(this_rq()->window_start); +} + +static void walt_init_once(void) +{ + init_irq_work(&walt_migration_irq_work, walt_irq_work); + init_irq_work(&walt_cpufreq_irq_work, walt_irq_work); + + walt_cpu_util_freq_divisor = + (sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100; + walt_scale_demand_divisor = sched_ravg_window >> SCHED_CAPACITY_SHIFT; + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + sched_init_task_load_windows_scaled = + scale_demand(sched_init_task_load_windows); +} + +void walt_sched_init_rq(struct rq *rq) +{ + static bool init; + int j; + + if (!init) { + walt_init_once(); + init = true; + } + + cpumask_set_cpu(cpu_of(rq), &rq->freq_domain_cpumask); + + rq->walt_stats.cumulative_runnable_avg_scaled = 0; + rq->window_start = 0; + rq->walt_flags = 0; + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; + + /* + * All cpus part of same cluster by default. This avoids the + * need to check for rq->cluster being non-NULL in hot-paths + * like select_best_cpu() + */ + rq->cluster = &init_cluster; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + rq->cum_window_demand_scaled = 0; + + for (j = 0; j < NUM_TRACKED_WINDOWS; j++) { + memset(&rq->load_subs[j], 0, sizeof(struct load_subtractions)); + } +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 000000000000..e22961349e9c --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,236 @@ +/* + * walt.h + * + * head file for Window-Assistant-Load-Tracking + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +#include + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +#define SCHED_NEW_TASK_WINDOWS 5 + +extern unsigned int sched_ravg_window; +extern unsigned int sysctl_sched_walt_init_task_load_pct; + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static inline struct sched_cluster *cpu_cluster(int cpu) +{ + return cpu_rq(cpu)->cluster; +} + +static inline bool is_new_task(struct task_struct *p) +{ + return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS; +} + +static inline unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +static inline void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. */ + smp_mb(); + dst->next = first; +} + +extern void reset_task_stats(struct task_struct *p); +extern void update_cluster_topology(void); +extern void init_clusters(void); +extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); + +static inline void +fixup_cumulative_runnable_avg(struct walt_sched_stats *stats, + s64 demand_scaled_delta) +{ + if (sched_disable_window_stats) + return; + + stats->cumulative_runnable_avg_scaled += demand_scaled_delta; + BUG_ON((s64)stats->cumulative_runnable_avg_scaled < 0); +} + +static inline void +walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, p->ravg.demand_scaled); + + /* + * Add a task's contribution to the cumulative window demand when + * + * (1) task is enqueued with on_rq = 1 i.e migration, + * prio/cgroup/class change. + * (2) task is waking for the first time in this window. + */ + if (p->on_rq || (p->last_sleep_ts < rq->window_start)) + walt_fixup_cum_window_demand(rq, p->ravg.demand_scaled); +} + +static inline void +walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, + -(s64)p->ravg.demand_scaled); + + /* + * on_rq will be 1 for sleeping tasks. So check if the task + * is migrating or dequeuing in RUNNING state to change the + * prio/cgroup/class. 
+ */ + if (task_on_rq_migrating(p) || p->state == TASK_RUNNING) + walt_fixup_cum_window_demand(rq, -(s64)p->ravg.demand_scaled); +} +extern void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +extern void inc_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void dec_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void fixup_busy_time(struct task_struct *p, int new_cpu); +extern void init_new_task_load(struct task_struct *p); +extern void mark_task_starting(struct task_struct *p); +extern void set_window_start(struct rq *rq); +void account_irqtime(int cpu, struct task_struct *curr, u64 delta, u64 wallclock); + +void walt_irq_work(struct irq_work *irq_work); + +void walt_sched_init_rq(struct rq *rq); + +extern void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock); + +#define SCHED_HIGH_IRQ_TIMEOUT 3 +static inline u64 sched_irqload(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + s64 delta; + + delta = get_jiffies_64() - rq->irqload_ts; + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. + */ + + if (delta < SCHED_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +static inline int sched_cpu_high_irqload(int cpu) +{ + return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; +} + +extern int +sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); + +static inline unsigned int cpu_cur_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->cur_freq; +} + +static inline void assign_cluster_ids(struct list_head *head) +{ + struct sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } +} + +#else /* CONFIG_SCHED_WALT */ +static inline void walt_sched_init_rq(struct rq *rq) { } + +static inline void update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } + +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) { } + +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) { } + +static inline void +inc_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void init_new_task_load(struct task_struct *p) { } +static inline void mark_task_starting(struct task_struct *p) { } +static inline void set_window_start(struct rq *rq) { } +static inline void update_cluster_topology(void) { } +static inline void init_clusters(void) { } + +static inline void +fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) { } + +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) { } + +static inline u64 sched_irqload(int cpu) +{ + return 0; +} +static inline int sched_cpu_high_irqload(int cpu) +{ + return 0; +} +#endif /* CONFIG_SCHED_WALT */ + +#endif /* __WALT_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1d8b4358aa11..f13b9e456f50 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,6 +1659,40 @@ static struct ctl_table kern_table[] = { .mode = 0644, 
.proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_walt_init_task_load_pct_sysctl_handler, + }, + { + .procname = "sched_cpu_high_irqload", + .data = &sysctl_sched_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_SCHED_DEBUG { .procname = "sched_min_granularity_ns", -- Gitee From b9c077b7601932dfbec50222f2bcc29ae11a1468 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Wed, 29 Dec 2021 23:49:49 +0800 Subject: [PATCH 2/2] maillist inclusion category: feature issue: #I4LKQ0 CVE: NA ------------------------------------------- Core control monitors load on CPUs and controls how many CPUs are available for the system to use at any point in time. This can help save power. Core control can be configured through sysfs interface. Add bitmask and corresponding supporting functions for cpu isolation. Signed-off-by: Olav Haugan Signed-off-by: Vikram Mulukutla Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: tangyizhou --- drivers/base/cpu.c | 44 ++ include/linux/cpuhotplug.h | 3 + include/linux/cpumask.h | 36 + include/linux/device.h | 1 + include/linux/hrtimer.h | 6 + include/linux/nmi.h | 21 + include/linux/sched.h | 35 + include/linux/sched/core_ctl.h | 36 + include/linux/sched/isolation.h | 19 + include/linux/sched/stat.h | 9 + include/linux/stop_machine.h | 9 + include/linux/timer.h | 3 + include/trace/events/sched.h | 156 ++++ kernel/cpu.c | 11 + kernel/irq/cpuhotplug.c | 49 +- kernel/irq/proc.c | 5 + kernel/sched/Makefile | 2 +- kernel/sched/core.c | 429 +++++++++- kernel/sched/core_ctl.c | 1307 +++++++++++++++++++++++++++++++ kernel/sched/core_ctl.h | 27 + kernel/sched/fair.c | 113 ++- kernel/sched/rt.c | 11 +- kernel/sched/sched.h | 49 ++ kernel/sched/sched_avg.c | 194 +++++ kernel/sched/topology.c | 11 +- kernel/smp.c | 3 +- kernel/stop_machine.c | 2 +- kernel/time/hrtimer.c | 140 ++-- kernel/time/timer.c | 40 +- kernel/watchdog.c | 36 +- mm/vmstat.c | 5 +- 31 files changed, 2671 insertions(+), 141 deletions(-) create mode 100644 include/linux/sched/core_ctl.h create mode 100644 kernel/sched/core_ctl.c create mode 100644 kernel/sched/core_ctl.h create mode 100644 kernel/sched/sched_avg.c diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 8f1d6569564c..0977edfc305e 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -180,9 +180,42 @@ static struct attribute_group crash_note_cpu_attr_group = { }; #endif +#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_HOTPLUG_CPU +static ssize_t isolate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + ssize_t rc; + int cpuid = cpu->dev.id; + unsigned int isolated = cpu_isolated(cpuid); + + rc = snprintf(buf, PAGE_SIZE-2, "%d\n", isolated); + + return rc; +} + +static DEVICE_ATTR_RO(isolate); + +static struct attribute *cpu_isolated_attrs[] = 
{ + &dev_attr_isolate.attr, + NULL +}; + +static struct attribute_group cpu_isolated_attr_group = { + .attrs = cpu_isolated_attrs, +}; +#endif +#endif + static const struct attribute_group *common_cpu_attr_groups[] = { #ifdef CONFIG_KEXEC &crash_note_cpu_attr_group, +#endif +#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_HOTPLUG_CPU + &cpu_isolated_attr_group, +#endif #endif NULL }; @@ -190,6 +223,11 @@ static const struct attribute_group *common_cpu_attr_groups[] = { static const struct attribute_group *hotplugable_cpu_attr_groups[] = { #ifdef CONFIG_KEXEC &crash_note_cpu_attr_group, +#endif +#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_HOTPLUG_CPU + &cpu_isolated_attr_group, +#endif #endif NULL }; @@ -220,6 +258,9 @@ static struct cpu_attr cpu_attrs[] = { _CPU_ATTR(online, &__cpu_online_mask), _CPU_ATTR(possible, &__cpu_possible_mask), _CPU_ATTR(present, &__cpu_present_mask), +#ifdef CONFIG_SCHED_WALT + _CPU_ATTR(core_ctl_isolated, &__cpu_isolated_mask), +#endif }; /* @@ -465,6 +506,9 @@ static struct attribute *cpu_root_attrs[] = { &cpu_attrs[0].attr.attr, &cpu_attrs[1].attr.attr, &cpu_attrs[2].attr.attr, +#ifdef CONFIG_SCHED_WALT + &cpu_attrs[3].attr.attr, +#endif &dev_attr_kernel_max.attr, &dev_attr_offline.attr, &dev_attr_isolated.attr, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8fb893ed205e..9be54199902b 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -71,6 +71,9 @@ enum cpuhp_state { CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, +#ifdef CONFIG_SCHED_WALT + CPUHP_CORE_CTL_ISOLATION_DEAD, +#endif CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_POWERPC_PMAC_PREPARE, CPUHP_POWERPC_MMU_CTX_PREPARE, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f0d895d6ac39..34d869d18e16 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -55,6 +55,7 @@ extern unsigned int nr_cpu_ids; * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration + * cpu_isolated_mask- has bit 'cpu' set iff cpu isolated * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. 
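The cpu_isolated_mask helpers introduced in this hunk boil down to simple mask arithmetic; a toy userspace version of the num_online_uniso_cpus() idea over an 8-bit mask, purely for illustration:

#include <stdint.h>

/* Count CPUs that are online and not isolated (toy 8-CPU bitmasks). */
static unsigned int count_online_unisolated(uint8_t online, uint8_t isolated)
{
	uint8_t uniso = online & (uint8_t)~isolated;
	unsigned int n = 0;

	while (uniso) {
		n += uniso & 1;
		uniso >>= 1;
	}
	return n;
}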
* @@ -96,6 +97,11 @@ extern struct cpumask __cpu_active_mask; #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) +#ifdef CONFIG_SCHED_WALT +extern struct cpumask __cpu_isolated_mask; +#define cpu_isolated_mask ((const struct cpumask *)&__cpu_isolated_mask) +#endif + extern atomic_t __num_online_cpus; #if NR_CPUS > 1 @@ -129,6 +135,22 @@ static inline unsigned int num_online_cpus(void) #define cpu_active(cpu) ((cpu) == 0) #endif +#if defined(CONFIG_SCHED_WALT) && NR_CPUS > 1 +#define num_isolated_cpus() cpumask_weight(cpu_isolated_mask) +#define num_online_uniso_cpus() \ +({ \ + cpumask_t mask; \ + \ + cpumask_andnot(&mask, cpu_online_mask, cpu_isolated_mask); \ + cpumask_weight(&mask); \ +}) +#define cpu_isolated(cpu) cpumask_test_cpu((cpu), cpu_isolated_mask) +#else /* !CONFIG_SCHED_WALT || NR_CPUS == 1 */ +#define num_isolated_cpus() 0U +#define num_online_uniso_cpus() num_online_cpus() +#define cpu_isolated(cpu) 0U +#endif + extern cpumask_t cpus_booted_once_mask; static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) @@ -811,6 +833,9 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) +#ifdef CONFIG_SCHED_WALT +#define for_each_isolated_cpu(cpu) for_each_cpu((cpu), cpu_isolated_mask) +#endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); @@ -851,6 +876,17 @@ set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, &__cpu_active_mask); } +#ifdef CONFIG_SCHED_WALT +static inline void +set_cpu_isolated(unsigned int cpu, bool isolated) +{ + if (isolated) + cpumask_set_cpu(cpu, &__cpu_isolated_mask); + else + cpumask_clear_cpu(cpu, &__cpu_isolated_mask); +} +#endif + /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * diff --git a/include/linux/device.h b/include/linux/device.h index 5dc0f81e4f9d..de0c8ead4696 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -813,6 +813,7 @@ static inline bool device_supports_offline(struct device *dev) void lock_device_hotplug(void); void unlock_device_hotplug(void); int lock_device_hotplug_sysfs(void); +void lock_device_hotplug_assert(void); int device_offline(struct device *dev); int device_online(struct device *dev); void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7f1b8549ebce..aa0cd15cb05c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -74,6 +74,7 @@ enum hrtimer_restart { * * 0x00 inactive * 0x01 enqueued into rbtree + * 0x02 timer is pinned to a cpu * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free @@ -93,6 +94,8 @@ enum hrtimer_restart { */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 +#define HRTIMER_PINNED_SHIFT 1 +#define HRTIMER_STATE_PINNED (1 << HRTIMER_PINNED_SHIFT) /** * struct hrtimer - the basic hrtimer structure @@ -367,6 +370,9 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) /* Exported timer functions: */ +/* To be used from cpusets, only */ +extern void hrtimer_quiesce_cpu(void *cpup); + /* Initialize timers: */ extern 
void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 750c7f395ca9..baa964d5d38c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -13,6 +13,11 @@ #ifdef CONFIG_LOCKUP_DETECTOR void lockup_detector_init(void); +#ifdef CONFIG_SCHED_WALT +extern void watchdog_enable(unsigned int cpu); +extern void watchdog_disable(unsigned int cpu); +extern bool watchdog_configured(unsigned int cpu); +#endif void lockup_detector_soft_poweroff(void); void lockup_detector_cleanup(void); bool is_hardlockup(void); @@ -37,6 +42,22 @@ extern int sysctl_hardlockup_all_cpu_backtrace; static inline void lockup_detector_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } static inline void lockup_detector_cleanup(void) { } +#ifdef CONFIG_SCHED_WALT +static inline void watchdog_enable(unsigned int cpu) +{ +} +static inline void watchdog_disable(unsigned int cpu) +{ +} +static inline bool watchdog_configured(unsigned int cpu) +{ + /* + * Pretend the watchdog is always configured. + * We will be waiting for the watchdog to be enabled in core isolation + */ + return true; +} +#endif #endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR diff --git a/include/linux/sched.h b/include/linux/sched.h index 194b96b59d67..66e26f69967c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -222,6 +222,41 @@ enum task_event { IRQ_UPDATE = 5, }; +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_SCHED_WALT) +extern int sched_isolate_count(const cpumask_t *mask, bool include_offline); +extern int sched_isolate_cpu(int cpu); +extern int sched_unisolate_cpu(int cpu); +extern int sched_unisolate_cpu_unlocked(int cpu); +#else +static inline int sched_isolate_count(const cpumask_t *mask, + bool include_offline) +{ + cpumask_t count_mask; + + if (include_offline) + cpumask_andnot(&count_mask, mask, cpu_online_mask); + else + return 0; + + return cpumask_weight(&count_mask); +} + +static inline int sched_isolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu(int cpu) +{ + return 0; +} + +static inline int sched_unisolate_cpu_unlocked(int cpu) +{ + return 0; +} +#endif + extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/include/linux/sched/core_ctl.h b/include/linux/sched/core_ctl.h new file mode 100644 index 000000000000..57f14b52c941 --- /dev/null +++ b/include/linux/sched/core_ctl.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, 2018, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +struct core_ctl_notif_data { + unsigned int nr_big; + unsigned int coloc_load_pct; +}; + +extern void core_ctl_check(u64 wallclock); + +#ifdef CONFIG_SCHED_WALT +int core_ctl_set_boost(bool boost); +void core_ctl_notifier_register(struct notifier_block *n); +void core_ctl_notifier_unregister(struct notifier_block *n); +#else +static inline int core_ctl_set_boost(bool boost) +{ + return 0; +} +static inline void core_ctl_notifier_register(struct notifier_block *n) {} +static inline void core_ctl_notifier_unregister(struct notifier_block *n) {} +#endif +#endif diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index cc9f393e2a70..049a1bba90bc 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -28,10 +28,25 @@ extern void __init housekeeping_init(void); #else +#ifdef CONFIG_SCHED_WALT +static inline int housekeeping_any_cpu(enum hk_flags flags) +{ + cpumask_t available; + int cpu; + + cpumask_andnot(&available, cpu_online_mask, cpu_isolated_mask); + cpu = cpumask_any(&available); + if (cpu >= nr_cpu_ids) + cpu = smp_processor_id(); + + return cpu; +} +#else static inline int housekeeping_any_cpu(enum hk_flags flags) { return smp_processor_id(); } +#endif static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { @@ -54,7 +69,11 @@ static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) if (static_branch_unlikely(&housekeeping_overridden)) return housekeeping_test_cpu(cpu, flags); #endif +#ifdef CONFIG_SCHED_WALT + return !cpu_isolated(cpu); +#else return true; +#endif } #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 568286411b43..ca8b0d1ccf94 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -21,6 +21,15 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); +#ifdef CONFIG_SCHED_WALT +extern unsigned int sched_get_cpu_util(int cpu); +#else +static inline unsigned int sched_get_cpu_util(int cpu) +{ + return 0; +} +#endif + static inline int sched_info_on(void) { #ifdef CONFIG_SCHEDSTATS diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 63ea9aff368f..6037d00555c7 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -32,6 +32,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); @@ -80,6 +81,14 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, return false; } +static inline int stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + if (cpumask_test_cpu(raw_smp_processor_id(), cpumask)) + return stop_one_cpu(raw_smp_processor_id(), fn, arg); + return -ENOENT; +} + #endif /* CONFIG_SMP */ /* diff --git a/include/linux/timer.h b/include/linux/timer.h index d10bc7e73b41..8cb904ce3dca 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -180,6 +180,9 @@ extern int timer_reduce(struct timer_list *timer, unsigned long expires); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) +/* To be used from 
cpusets, only */ +extern void timer_quiesce_cpu(void *cpup); + extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c96a4337afe6..245671bc6292 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -600,6 +600,162 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); +#ifdef CONFIG_SCHED_WALT + +TRACE_EVENT(core_ctl_eval_need, + + TP_PROTO(unsigned int cpu, unsigned int old_need, + unsigned int new_need, unsigned int updated), + TP_ARGS(cpu, old_need, new_need, updated), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, old_need) + __field(u32, new_need) + __field(u32, updated) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->old_need = old_need; + __entry->new_need = new_need; + __entry->updated = updated; + ), + TP_printk("cpu=%u, old_need=%u, new_need=%u, updated=%u", __entry->cpu, + __entry->old_need, __entry->new_need, __entry->updated) +); + +TRACE_EVENT(core_ctl_set_busy, + + TP_PROTO(unsigned int cpu, unsigned int busy, + unsigned int old_is_busy, unsigned int is_busy), + TP_ARGS(cpu, busy, old_is_busy, is_busy), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, busy) + __field(u32, old_is_busy) + __field(u32, is_busy) + __field(bool, high_irqload) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->busy = busy; + __entry->old_is_busy = old_is_busy; + __entry->is_busy = is_busy; + __entry->high_irqload = sched_cpu_high_irqload(cpu); + ), + TP_printk("cpu=%u, busy=%u, old_is_busy=%u, new_is_busy=%u high_irqload=%d", + __entry->cpu, __entry->busy, __entry->old_is_busy, + __entry->is_busy, __entry->high_irqload) +); + +TRACE_EVENT(core_ctl_set_boost, + + TP_PROTO(u32 refcount, s32 ret), + TP_ARGS(refcount, ret), + TP_STRUCT__entry( + __field(u32, refcount) + __field(s32, ret) + ), + TP_fast_assign( + __entry->refcount = refcount; + __entry->ret = ret; + ), + TP_printk("refcount=%u, ret=%d", __entry->refcount, __entry->ret) +); + +TRACE_EVENT(core_ctl_update_nr_need, + + TP_PROTO(int cpu, int nr_need, int prev_misfit_need, + int nrrun, int max_nr, int nr_prev_assist), + + TP_ARGS(cpu, nr_need, prev_misfit_need, nrrun, max_nr, nr_prev_assist), + + TP_STRUCT__entry( + __field( int, cpu) + __field( int, nr_need) + __field( int, prev_misfit_need) + __field( int, nrrun) + __field( int, max_nr) + __field( int, nr_prev_assist) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr_need = nr_need; + __entry->prev_misfit_need = prev_misfit_need; + __entry->nrrun = nrrun; + __entry->max_nr = max_nr; + __entry->nr_prev_assist = nr_prev_assist; + ), + + TP_printk("cpu=%d nr_need=%d prev_misfit_need=%d nrrun=%d max_nr=%d nr_prev_assist=%d", + __entry->cpu, __entry->nr_need, __entry->prev_misfit_need, + __entry->nrrun, __entry->max_nr, __entry->nr_prev_assist) +); +/* + * Tracepoint for sched_get_nr_running_avg + */ +TRACE_EVENT(sched_get_nr_running_avg, + + TP_PROTO(int cpu, int nr, int nr_misfit, int nr_max), + + TP_ARGS(cpu, nr, nr_misfit, nr_max), + + TP_STRUCT__entry( + __field( int, cpu) + __field( int, nr) + __field( int, nr_misfit) + __field( int, nr_max) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr = nr; + __entry->nr_misfit = nr_misfit; + __entry->nr_max = nr_max; + ), + + TP_printk("cpu=%d nr=%d nr_misfit=%d nr_max=%d", + __entry->cpu, __entry->nr, __entry->nr_misfit, __entry->nr_max) +); + +/* + * sched_isolate - called when cores are isolated/unisolated + * + * 
@acutal_mask: mask of cores actually isolated/unisolated + * @req_mask: mask of cores requested isolated/unisolated + * @online_mask: cpu online mask + * @time: amount of time in us it took to isolate/unisolate + * @isolate: 1 if isolating, 0 if unisolating + * + */ +TRACE_EVENT(sched_isolate, + + TP_PROTO(unsigned int requested_cpu, unsigned int isolated_cpus, + u64 start_time, unsigned char isolate), + + TP_ARGS(requested_cpu, isolated_cpus, start_time, isolate), + + TP_STRUCT__entry( + __field(u32, requested_cpu) + __field(u32, isolated_cpus) + __field(u32, time) + __field(unsigned char, isolate) + ), + + TP_fast_assign( + __entry->requested_cpu = requested_cpu; + __entry->isolated_cpus = isolated_cpus; + __entry->time = div64_u64(sched_clock() - start_time, 1000); + __entry->isolate = isolate; + ), + + TP_printk("iso cpu=%u cpus=0x%x time=%u us isolated=%d", + __entry->requested_cpu, __entry->isolated_cpus, + __entry->time, __entry->isolate) +); + +#endif + /* * Following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. diff --git a/kernel/cpu.c b/kernel/cpu.c index 67c22941b5f2..fde59877742d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1052,6 +1052,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (!cpu_present(cpu)) return -EINVAL; + if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1) + return -EBUSY; + cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; @@ -2495,6 +2498,9 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); +struct cpumask __cpu_isolated_mask __read_mostly; +EXPORT_SYMBOL(__cpu_isolated_mask); + atomic_t __num_online_cpus __read_mostly; EXPORT_SYMBOL(__num_online_cpus); @@ -2513,6 +2519,11 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(&__cpu_online_mask, src); } +void init_cpu_isolated(const struct cpumask *src) +{ + cpumask_copy(&__cpu_isolated_mask, src); +} + void set_cpu_online(unsigned int cpu, bool online) { /* diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 02236b13b359..89981adfb748 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "internals.h" @@ -58,6 +59,9 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity; bool brokeaff = false; int err; +#ifdef CONFIG_SCHED_WALT + struct cpumask available_cpus; +#endif /* * IRQ chip might be already torn down, but the irq descriptor is @@ -110,7 +114,17 @@ static bool migrate_one_irq(struct irq_desc *desc) if (maskchip && chip->irq_mask) chip->irq_mask(d); +#ifdef CONFIG_SCHED_WALT + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; +#endif + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { +#ifdef CONFIG_SCHED_WALT + const struct cpumask *default_affinity; +#endif + /* * If the interrupt is managed, then shut it down and leave * the affinity untouched. @@ -120,16 +134,47 @@ static bool migrate_one_irq(struct irq_desc *desc) irq_shutdown_and_deactivate(desc); return false; } + +#ifdef CONFIG_SCHED_WALT + default_affinity = desc->affinity_hint ? 
: irq_default_affinity; + /* + * The order of preference for selecting a fallback CPU is + * + * (1) online and un-isolated CPU from default affinity + * (2) online and un-isolated CPU + * (3) online CPU + */ + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_intersects(&available_cpus, default_affinity)) + cpumask_and(&available_cpus, &available_cpus, + default_affinity); + else if (cpumask_empty(&available_cpus)) + affinity = cpu_online_mask; + + /* + * We are overriding the affinity with all online and + * un-isolated cpus. irq_set_affinity_locked() call + * below notify this mask to PM QOS affinity listener. + * That results in applying the CPU_DMA_LATENCY QOS + * to all the CPUs specified in the mask. But the low + * level irqchip driver sets the affinity of an irq + * to only one CPU. So pick only one CPU from the + * prepared mask while overriding the user affinity. + */ + affinity = cpumask_of(cpumask_any(affinity)); +#else affinity = cpu_online_mask; +#endif brokeaff = true; } /* - * Do not set the force argument of irq_do_set_affinity() as this + * Do not set the force argument of irq_set_affinity_locked() as this * disables the masking of offline CPUs from the supplied affinity * mask and therefore might keep/reassign the irq to the outgoing * CPU. */ - err = irq_do_set_affinity(d, affinity, false); + err = irq_set_affinity_locked(d, affinity, false); if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 72513ed2a5fc..b6e4c4930b04 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -154,6 +154,11 @@ static ssize_t write_irq_affinity(int type, struct file *file, if (err) goto free_cpumask; + if (cpumask_subset(new_value, cpu_isolated_mask)) { + err = -EINVAL; + goto free_cpumask; + } + /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 75ab238bde9d..b74f3553f0b5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -27,7 +27,7 @@ obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -obj-$(CONFIG_SCHED_WALT) += walt.o +obj-$(CONFIG_SCHED_WALT) += walt.o core_ctl.o sched_avg.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 41072cfe0c57..e8c671e7d3b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include @@ -1886,6 +1888,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq_flags rf; struct rq *rq; int ret = 0; +#ifdef CONFIG_SCHED_WALT + cpumask_t allowed_mask; +#endif rq = task_rq_lock(p, &rf); update_rq_clock(rq); @@ -1909,6 +1914,20 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + cpumask_and(&allowed_mask, &allowed_mask, cpu_valid_mask); + + dest_cpu = cpumask_any(&allowed_mask); + if (dest_cpu >= nr_cpu_ids) { + cpumask_and(&allowed_mask, cpu_valid_mask, new_mask); + dest_cpu = cpumask_any(&allowed_mask); + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + ret = -EINVAL; + goto out; + } + } +#else /* * Picking a ~random 
cpu helps in cases where we are changing affinity * for groups of tasks (ie. cpuset), so that load balancing is not @@ -1919,6 +1938,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, ret = -EINVAL; goto out; } +#endif do_set_cpus_allowed(p, new_mask); @@ -1933,8 +1953,13 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, } /* Can the task run on the task's current CPU? If so, we're done */ +#ifdef CONFIG_SCHED_WALT + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) + goto out; +#else if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; +#endif if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; @@ -2286,12 +2311,15 @@ EXPORT_SYMBOL_GPL(kick_process); * select_task_rq() below may allow selection of !active CPUs in order * to satisfy the above rules. */ -static int select_fallback_rq(int cpu, struct task_struct *p) +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; - enum { cpuset, possible, fail } state = cpuset; + enum { cpuset, possible, fail, bug } state = cpuset; int dest_cpu; +#ifdef CONFIG_SCHED_WALT + int isolated_candidate = -1; +#endif /* * If the node that the CPU is on has been offlined, cpu_to_node() @@ -2305,6 +2333,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, nodemask) { if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) return dest_cpu; } @@ -2315,7 +2345,18 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, p->cpus_ptr) { if (!is_cpu_allowed(p, dest_cpu)) continue; +#ifdef CONFIG_SCHED_WALT + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; +#endif goto out; } @@ -2334,6 +2375,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) break; case fail: +#ifdef CONFIG_SCHED_WALT + + allow_iso = true; + state = bug; + break; +#else + /* fall through */ +#endif + + case bug: BUG(); break; } @@ -2361,6 +2412,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { + bool allow_isolated = (p->flags & PF_KTHREAD); + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -2378,8 +2431,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!is_cpu_allowed(p, cpu))) - cpu = select_fallback_rq(task_cpu(p), p); + if (unlikely(!is_cpu_allowed(p, cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); return cpu; } @@ -3932,7 +3986,7 @@ void sched_exec(void) if (dest_cpu == smp_processor_id()) goto unlock; - if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -5566,10 +5620,11 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) * Return: 0 on success. An error code otherwise. 
*/ int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) + const struct sched_param *param) { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); /* * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally @@ -5928,6 +5983,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; +#ifdef CONFIG_SCHED_WALT + int dest_cpu; + cpumask_t allowed_mask; +#endif rcu_read_lock(); @@ -5989,20 +6048,30 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); + if (dest_cpu < nr_cpu_ids) { +#endif + retval = __set_cpus_allowed_ptr(p, new_mask, true); + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } } +#ifdef CONFIG_SCHED_WALT + } else { + retval = -EINVAL; } +#endif + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -6066,6 +6135,16 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); + +#ifdef CONFIG_SCHED_WALT + /* The userspace tasks are forbidden to run on + * isolated CPUs. So exclude isolated CPUs from + * the getaffinity. + */ + if (!(p->flags & PF_KTHREAD)) + cpumask_andnot(mask, mask, cpu_isolated_mask); +#endif + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -6753,20 +6832,66 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) BUG(); } +#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_HOTPLUG_CPU /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Remove a task from the runqueue and pretend that it's migrating. This + * should prevent migrations for the detached task and disallow further + * changes to tsk_cpus_allowed. + */ +void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks) +{ + lockdep_assert_held(&rq->lock); + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(rq, p, 0); + list_add(&p->se.group_node, tasks); +} + +void attach_tasks_core(struct list_head *tasks, struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_held(&rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + BUG_ON(task_rq(p) != rq); + activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + } +} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SCHED_WALT */ + +/* + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). 
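+ * When migrate_pinned_tasks is false, per-cpu kthreads that have no
+ * allowed CPU left among the online, un-isolated ones are detached via
+ * detach_one_task_core() and re-attached to this rq once it has been
+ * drained, instead of being force-migrated.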
* * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, + bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; struct rq_flags orf = *rf; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + LIST_HEAD(tasks); + cpumask_t avail_cpus; + +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); +#else + cpumask_copy(&avail_cpus, cpu_online_mask); +#endif /* * Fudge the rq selection such that the below task selection loop @@ -6789,13 +6914,20 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) for (;;) { /* * There's this thread running, bail when that's the only - * remaining thread: + * remaining thread. */ if (rq->nr_running == 1) break; next = __pick_migrate_task(rq); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_mask)) { + detach_one_task_core(next, rq, &tasks); + num_pinned_kthreads += 1; + continue; + } + /* * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either @@ -6808,31 +6940,278 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) rq_unlock(rq, rf); raw_spin_lock(&next->pi_lock); rq_relock(rq, rf); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); /* * Since we're inside stop-machine, _nothing_ should have * changed the task, WARN if weird stuff happened, because in * that case the above rq->lock drop is a fail too. + * However, during cpu isolation the load balancer might have + * interferred since we don't stop all CPUs. Ignore warning for + * this case. */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + if (task_rq(next) != rq || !task_on_rq_queued(next)) { + WARN_ON(migrate_pinned_tasks); raw_spin_unlock(&next->pi_lock); continue; } /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); rq = __migrate_task(rq, rf, next, dest_cpu); if (rq != dead_rq) { rq_unlock(rq, rf); rq = dead_rq; *rf = orf; rq_relock(rq, rf); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); } raw_spin_unlock(&next->pi_lock); } rq->stop = stop; + + if (num_pinned_kthreads > 1) + attach_tasks_core(&tasks, rq); } + +void set_rq_online(struct rq *rq); +void set_rq_offline(struct rq *rq); + +#ifdef CONFIG_SCHED_WALT + +int do_isolation_work_cpu_stop(void *data) +{ + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + watchdog_disable(cpu); + + local_irq_disable(); + + irq_migrate_all_off_this_cpu(); + + flush_smp_call_function_from_idle(); + + /* Update our root-domain */ + rq_lock(rq, &rf); + + /* + * Temporarily mark the rq as offline. This will allow us to + * move tasks off the CPU. 
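+ * The rq is put back online further below once migrate_tasks() has
+ * moved every task that can run elsewhere.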
+ */ + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, &rf, false); + + if (rq->rd) + set_rq_online(rq); + rq_unlock(rq, &rf); + + local_irq_enable(); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. + */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); +} + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq; + cpumask_t avail_cpus; + int ret_code = 0; + u64 start_time = 0; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + cpu_maps_update_begin(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) || + !cpu_online(cpu) || cpu >= NR_CPUS) { + ret_code = -EINVAL; + goto out; + } + + rq = cpu_rq(cpu); + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + --cpu_isolation_vote[cpu]; + ret_code = -EINVAL; + goto out; + } + + /* + * There is a race between watchdog being enabled by hotplug and + * core isolation disabling the watchdog. When a CPU is hotplugged in + * and the hotplug lock has been released the watchdog thread might + * not have run yet to enable the watchdog. + * We have to wait for the watchdog to be enabled before proceeding. 
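+ * A single 20ms sleep-and-recheck below serves as that wait; if the
+ * watchdog is still not configured afterwards, the isolation request
+ * is rejected with -EBUSY and the vote taken above is dropped.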
+ */ + if (!watchdog_configured(cpu)) { + msleep(20); + if (!watchdog_configured(cpu)) { + --cpu_isolation_vote[cpu]; + ret_code = -EBUSY; + goto out; + } + } + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + watchdog_disable(cpu); + irq_lock_sparse(); + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + irq_unlock_sparse(); + + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + cpu_maps_update_done(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + u64 start_time = 0; + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) + || cpu >= NR_CPUS) { + ret_code = -EINVAL; + goto out; + } + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + cpu_maps_update_begin(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + cpu_maps_update_done(); + return ret_code; +} + +#endif /* CONFIG_SCHED_WALT */ + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -7020,7 +7399,7 @@ int sched_cpu_dying(unsigned int cpu) BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq, &rf); + migrate_tasks(rq, &rf, true); BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 000000000000..b5991805fe80 --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1307 @@ +/* Copyright (c) 2014-2018, 2020, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#define pr_fmt(fmt) "core_ctl: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sched.h" +#include "walt.h" + +#define MAX_CPUS_PER_CLUSTER 6 +#define MAX_CLUSTERS 3 + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + unsigned int nr_isolated_cpus; + unsigned int nr_not_preferred_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + unsigned int max_nr; + unsigned int nr_prev_assist; + unsigned int nr_prev_assist_thresh; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool enable; + int nrrun; + struct task_struct *core_ctl_thread; + unsigned int first_cpu; + unsigned int boost; + struct kobject kobj; +}; + +struct cpu_data { + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool isolated_by_us; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\ + (idx)++) + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(struct cluster_data *state); +static bool initialized; + +ATOMIC_NOTIFIER_HEAD(core_ctl_notifier); +static unsigned int last_nr_big; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); +static void cpuset_next(struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + cpuset_next(state); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + cpuset_next(state); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + 
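/* re-evaluate the cluster's need right away with the new threshold */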
apply_need(state); + + return count; +} + +static ssize_t show_nr_prev_assist_thresh(const struct cluster_data *state, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->nr_prev_assist_thresh); +} + +static ssize_t store_nr_prev_assist_thresh(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->nr_prev_assist_thresh = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(const struct cluster_data *state, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u %u %u\n", + &val[0], &val[1], &val[2], &val[3], + &val[4], &val[5]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u %u %u\n", + &val[0], &val[1], &val[2], &val[3], + &val[4], &val[5]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_enable(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + bool bval; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + bval = !!val; + if (bval != state->enable) { + state->enable = bval; + apply_need(state); + } + + return count; +} + +static ssize_t show_enable(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->enable); +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + spin_lock_irq(&state_lock); + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + count += 
snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", + cpu_online(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIsolated: %u\n", + cpu_isolated(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNot preferred: %u\n", + c->not_preferred); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", cluster->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr isolated CPUs: %u\n", + cluster->nr_isolated_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + spin_unlock_irq(&state_lock); + + return count; +} + +static ssize_t store_not_preferred(struct cluster_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i; + unsigned int val[MAX_CPUS_PER_CLUSTER]; + unsigned long flags; + int ret; + int not_preferred_count = 0; + + ret = sscanf(buf, "%u %u %u %u %u %u\n", + &val[0], &val[1], &val[2], &val[3], + &val[4], &val[5]); + if (ret != state->num_cpus) + return -EINVAL; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + c->not_preferred = val[i]; + not_preferred_count += !!val[i]; + } + state->nr_not_preferred_cpus = not_preferred_count; + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + +static ssize_t show_not_preferred(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + int i; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "CPU#%d: %u\n", c->cpu, c->not_preferred); + } + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *, char *); + ssize_t (*store)(struct cluster_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_rw(offline_delay_ms); +core_ctl_attr_rw(busy_up_thres); +core_ctl_attr_rw(busy_down_thres); +core_ctl_attr_rw(task_thres); +core_ctl_attr_rw(nr_prev_assist_thresh); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(not_preferred); +core_ctl_attr_rw(enable); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &offline_delay_ms.attr, + &busy_up_thres.attr, + &busy_down_thres.attr, + &task_thres.attr, + &nr_prev_assist_thresh.attr, + &enable.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + ¬_preferred.attr, + NULL +}; + +#define to_cluster_data(k) 
container_of(k, struct cluster_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +static struct sched_avg_stats nr_stats[NR_CPUS]; + +/* + * nr_need: + * Number of tasks running on this cluster plus + * tasks running on higher capacity clusters. + * To find out CPUs needed from this cluster. + * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 4 small tasks running on min capacity CPUs + * and 2 big tasks running on 2 max capacity + * CPUs, nr_need has to be 6 for min capacity + * cluster and 2 for max capacity cluster. + * This is because, min capacity cluster has to + * account for tasks running on max capacity + * cluster, so that, the min capacity cluster + * can be ready to accommodate tasks running on max + * capacity CPUs if the demand of tasks goes down. + */ +static int compute_cluster_nr_need(int index) +{ + int cpu; + struct cluster_data *cluster; + int nr_need = 0; + + for_each_cluster(cluster, index) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_need += nr_stats[cpu].nr; + } + + return nr_need; +} + +/* + * prev_misfit_need: + * Tasks running on smaller capacity cluster which + * needs to be migrated to higher capacity cluster. + * To find out how many tasks need higher capacity CPUs. + * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 2 small tasks and 2 big tasks running on + * min capacity CPUs and no tasks running on + * max cpacity, prev_misfit_need of min capacity + * cluster will be 0 and prev_misfit_need of + * max capacity cluster will be 2. + */ +static int compute_prev_cluster_misfit_need(int index) +{ + int cpu; + struct cluster_data *prev_cluster; + int prev_misfit_need = 0; + + /* + * Lowest capacity cluster does not have to + * accommodate any misfit tasks. 
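+ * There is no lower-capacity cluster whose misfit tasks could spill
+ * into this one, so index 0 always reports zero.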
+ */ + if (index == 0) + return 0; + + prev_cluster = &cluster_state[index - 1]; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + prev_misfit_need += nr_stats[cpu].nr_misfit; + + return prev_misfit_need; +} + +static int compute_cluster_max_nr(int index) +{ + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + int max_nr = 0; + + for_each_cpu(cpu, &cluster->cpu_mask) + max_nr = max(max_nr, nr_stats[cpu].nr_max); + + return max_nr; +} + +static int cluster_real_big_tasks(int index) +{ + int nr_big = 0; + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + + if (index == 0) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr_misfit; + } else { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr; + } + + return nr_big; +} + +/* + * prev_nr_need_assist: + * Tasks that are eligible to run on the previous + * cluster but cannot run because of insufficient + * CPUs there. prev_nr_need_assist is indicative + * of number of CPUs in this cluster that should + * assist its previous cluster to makeup for + * insufficient CPUs there. + * + * For example: + * On tri-cluster system with 4 min capacity + * CPUs, 3 intermediate capacity CPUs and 1 + * max capacity CPU, if there are 4 small + * tasks running on min capacity CPUs, 4 big + * tasks running on intermediate capacity CPUs + * and no tasks running on max capacity CPU, + * prev_nr_need_assist for min & max capacity + * clusters will be 0, but, for intermediate + * capacity cluster prev_nr_need_assist will + * be 1 as it has 3 CPUs, but, there are 4 big + * tasks to be served. + */ +static int prev_cluster_nr_need_assist(int index) +{ + int need = 0; + int cpu; + struct cluster_data *prev_cluster; + + if (index == 0) + return 0; + + index--; + prev_cluster = &cluster_state[index]; + + /* + * Next cluster should not assist, while there are isolated cpus + * in this cluster. + */ + if (prev_cluster->nr_isolated_cpus) + return 0; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + need += nr_stats[cpu].nr; + + need += compute_prev_cluster_misfit_need(index); + + if (need > prev_cluster->active_cpus) + need = need - prev_cluster->active_cpus; + else + need = 0; + + return need; +} + +static void update_running_avg(void) +{ + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + int big_avg = 0; + + sched_get_nr_running_avg(nr_stats); + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + int nr_need, prev_misfit_need; + + if (!cluster->inited) + continue; + + nr_need = compute_cluster_nr_need(index); + prev_misfit_need = compute_prev_cluster_misfit_need(index); + + + cluster->nrrun = nr_need + prev_misfit_need; + cluster->max_nr = compute_cluster_max_nr(index); + cluster->nr_prev_assist = prev_cluster_nr_need_assist(index); + + trace_core_ctl_update_nr_need(cluster->first_cpu, nr_need, + prev_misfit_need, + cluster->nrrun, cluster->max_nr, + cluster->nr_prev_assist); + + big_avg += cluster_real_big_tasks(index); + } + spin_unlock_irqrestore(&state_lock, flags); + + last_nr_big = big_avg; +} + +#define MAX_NR_THRESHOLD 4 +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* unisolate all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* + * unisolate as many cores as the previous cluster + * needs assistance with. 
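+ * For example, with new_need = 2, nr_prev_assist = 2 and a
+ * nr_prev_assist_thresh of 2, the need becomes 4. With the default
+ * nr_prev_assist_thresh of UINT_MAX this path is effectively never
+ * taken unless the sysfs tunable is lowered.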
+ */ + if (cluster->nr_prev_assist >= cluster->nr_prev_assist_thresh) + new_need = new_need + cluster->nr_prev_assist; + + /* only unisolate more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + new_need = new_need + 1; + + /* + * We don't want tasks to be overcrowded in a cluster. + * If any CPU has more than MAX_NR_THRESHOLD in the last + * window, bring another CPU to help out. + */ + if (cluster->max_nr > MAX_NR_THRESHOLD) + new_need = new_need + 1; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_isolate_count(&cluster->cpu_mask, true); +} + +static bool is_active(const struct cpu_data *state) +{ + return cpu_online(state->cpu) && !cpu_isolated(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + cluster->nr_isolated_cpus)); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int new_need; + s64 now, elapsed; + + if (unlikely(!cluster->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost || !cluster->enable) { + need_cpus = cluster->max_cpus; + } else { + cluster->active_cpus = get_active_cpu_count(cluster); + thres_idx = cluster->active_cpus ? cluster->active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + bool old_is_busy = c->is_busy; + + if (c->busy >= cluster->busy_up_thres[thres_idx] || + sched_cpu_high_irqload(c->cpu)) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + + trace_core_ctl_set_busy(c->cpu, c->busy, old_is_busy, + c->is_busy); + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + now = ktime_to_ms(ktime_get()); + + if (new_need > cluster->active_cpus) { + ret = 1; + } else { + /* + * When there is no change in need and there are no more + * active CPUs than currently needed, just update the + * need time stamp and return. 
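+ * Otherwise, a drop in need only takes effect once it has persisted
+ * for offline_delay_ms (100ms by default), so short dips in load do
+ * not isolate CPUs.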
+ */ + if (new_need == last_need && new_need == cluster->active_cpus) { + cluster->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + elapsed = now - cluster->need_ts; + ret = elapsed >= cluster->offline_delay_ms; + } + + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } + trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(struct cluster_data *cluster) +{ + unsigned long flags; + + spin_lock_irqsave(&cluster->pending_lock, flags); + cluster->pending = true; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + wake_up_process(cluster->core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; + +int core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster = NULL; + unsigned long flags; + int ret = 0; + bool boost_state_changed = false; + + if (unlikely(!initialized)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + if (boost) { + boost_state_changed = !cluster->boost; + ++cluster->boost; + } else { + if (!cluster->boost) { + ret = -EINVAL; + break; + } else { + --cluster->boost; + boost_state_changed = !cluster->boost; + } + } + } + spin_unlock_irqrestore(&state_lock, flags); + + if (boost_state_changed) { + index = 0; + for_each_cluster(cluster, index) + apply_need(cluster); + } + + if (cluster) + trace_core_ctl_set_boost(cluster->boost, ret); + + return ret; +} +EXPORT_SYMBOL(core_ctl_set_boost); + +void core_ctl_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&core_ctl_notifier, n); +} + +void core_ctl_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&core_ctl_notifier, n); +} + +static void core_ctl_call_notifier(void) +{ + struct core_ctl_notif_data ndata; + struct notifier_block *nb; + + /* + * Don't bother querying the stats when the notifier + * chain is empty. 
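+ * Only the head pointer is checked here; the chain itself is walked
+ * by atomic_notifier_call_chain() below.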
+ */ + rcu_read_lock(); + nb = rcu_dereference_raw(core_ctl_notifier.head); + rcu_read_unlock(); + + if (!nb) + return; + + ndata.nr_big = last_nr_big; + + atomic_notifier_call_chain(&core_ctl_notifier, 0, &ndata); +} + +void core_ctl_check(u64 window_start) +{ + int cpu; + struct cpu_data *c; + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + + if (unlikely(!initialized)) + return; + + if (window_start == core_ctl_check_timestamp) + return; + + core_ctl_check_timestamp = window_start; + + spin_lock_irqsave(&state_lock, flags); + for_each_possible_cpu(cpu) { + + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + + if (!cluster || !cluster->inited) + continue; + + c->busy = sched_get_cpu_util(cpu); + } + spin_unlock_irqrestore(&state_lock, flags); + + update_running_avg(); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); + } + + core_ctl_call_notifier(); +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + spin_unlock_irqrestore(&state_lock, flags); +} + +static void cpuset_next(struct cluster_data *cluster) { } + +static bool should_we_isolate(int cpu, struct cluster_data *cluster) +{ + return true; +} + +static void try_to_isolate(struct cluster_data *cluster, unsigned int need) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_isolated = 0; + bool first_pass = cluster->nr_not_preferred_cpus; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't isolate busy CPUs. */ + if (c->is_busy) + continue; + + /* + * We isolate only the not_preferred CPUs. If none + * of the CPUs are selected as not_preferred, then + * all CPUs are eligible for isolation. + */ + if (cluster->nr_not_preferred_cpus && !c->not_preferred) + continue; + + if (!should_we_isolate(c->cpu, cluster)) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + +again: + /* + * If the number of active CPUs is within the limits, then + * don't force isolation of any busy CPUs. 
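+ * Otherwise a second, forced pass below isolates the not_preferred
+ * CPUs first and, if the cluster is still above max_cpus, any
+ * remaining active CPU even if it is busy.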
+ */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + nr_isolated = 0; + num_cpus = cluster->num_cpus; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + if (first_pass && !c->not_preferred) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + + if (first_pass && cluster->active_cpus > cluster->max_cpus) { + first_pass = false; + goto again; + } +} + +static void __try_to_unisolate(struct cluster_data *cluster, + unsigned int need, bool force) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_unisolated = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!c->isolated_by_us) + continue; + if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to unisolate CPU%u\n", c->cpu); + if (!sched_unisolate_cpu(c->cpu)) { + c->isolated_by_us = false; + move_cpu_lru(c); + nr_unisolated++; + } else { + pr_debug("Unable to unisolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus -= nr_unisolated; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_unisolate(struct cluster_data *cluster, unsigned int need) +{ + bool force_use_non_preferred = false; + + __try_to_unisolate(cluster, need, force_use_non_preferred); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_unisolate(cluster, need, force_use_non_preferred); +} + +static void __ref do_core_ctl(struct cluster_data *cluster) +{ + unsigned int need; + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_isolate(cluster, need); + else if (cluster->active_cpus < need) + try_to_unisolate(cluster, need); + } +} + +static int __ref try_core_ctl(void *data) +{ + struct cluster_data *cluster = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&cluster->pending_lock, flags); + if (!cluster->pending) { + spin_unlock_irqrestore(&cluster->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&cluster->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + cluster->pending = false; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + do_core_ctl(cluster); + } + + return 0; +} + +static int isolation_cpuhp_state(unsigned int cpu, bool online) +{ + struct cpu_data *state = &per_cpu(cpu_state, cpu); + 
struct cluster_data *cluster = state->cluster; + unsigned int need; + bool do_wakeup = false, unisolated = false; + unsigned long flags; + + if (unlikely(!cluster || !cluster->inited)) + return 0; + + if (online) { + cluster->active_cpus = get_active_cpu_count(cluster); + + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + move_cpu_lru(state); + } else { + /* + * We don't want to have a CPU both offline and isolated. + * So unisolate a CPU that went down if it was isolated by us. + */ + if (state->isolated_by_us) { + sched_unisolate_cpu_unlocked(cpu); + state->isolated_by_us = false; + unisolated = true; + } + + /* Move a CPU to the end of the LRU when it goes offline. */ + move_cpu_lru(state); + + state->busy = 0; + cluster->active_cpus = get_active_cpu_count(cluster); + } + + need = apply_limits(cluster, cluster->need_cpus); + spin_lock_irqsave(&state_lock, flags); + if (unisolated) + cluster->nr_isolated_cpus--; + do_wakeup = adjustment_possible(cluster, need); + spin_unlock_irqrestore(&state_lock, flags); + if (do_wakeup) + wake_up_core_ctl_thread(cluster); + + return 0; +} + +static int core_ctl_isolation_online_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, true); +} + +static int core_ctl_isolation_dead_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, false); +} + +/* ============================ init code ============================== */ + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. 
Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nr_prev_assist_thresh = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + cluster->enable = true; + cluster->nr_not_preferred_cpus = 0; + INIT_LIST_HEAD(&cluster->lru); + spin_lock_init(&cluster->pending_lock); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster, + "core_ctl/%d", first_cpu); + if (IS_ERR(cluster->core_ctl_thread)) + return PTR_ERR(cluster->core_ctl_thread); + + sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO, + ¶m); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +static int __init core_ctl_init(void) +{ + struct sched_cluster *cluster; + int ret; + + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "core_ctl/isolation:online", + core_ctl_isolation_online_cpu, NULL); + + cpuhp_setup_state_nocalls(CPUHP_CORE_CTL_ISOLATION_DEAD, + "core_ctl/isolation:dead", + NULL, core_ctl_isolation_dead_cpu); + + for_each_sched_cluster(cluster) { + ret = cluster_init(&cluster->cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + } + + initialized = true; + return 0; +} + +late_initcall(core_ctl_init); diff --git a/kernel/sched/core_ctl.h b/kernel/sched/core_ctl.h new file mode 100644 index 000000000000..98d7cb3e899b --- /dev/null +++ b/kernel/sched/core_ctl.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#ifdef CONFIG_SCHED_CORE_CTL +void core_ctl_check(u64 wallclock); +int core_ctl_set_boost(bool boost); +#else +static inline void core_ctl_check(u64 wallclock) {} +static inline int core_ctl_set_boost(bool boost) +{ + return 0; +} +#endif +#endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc8529ccb6a9..9f70422acc7f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6237,6 +6237,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return -1; + if (cpu_isolated(cpu)) + continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; } @@ -6308,15 +6310,15 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_capacity(task_util, target)) + !cpu_isolated(target) && asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_capacity(task_util, prev)) + ((available_idle_cpu(prev) || sched_idle_cpu(prev)) && + !cpu_isolated(target) && asym_fits_capacity(task_util, prev))) return prev; /* @@ -8290,6 +8292,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu) for_each_cpu(cpu, sched_group_span(sdg)) { unsigned long cpu_cap = capacity_of(cpu); + if (cpu_isolated(cpu)) + continue; + capacity += cpu_cap; min_capacity = min(cpu_cap, min_capacity); max_capacity = max(cpu_cap, max_capacity); @@ -8303,10 +8308,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { struct sched_group_capacity *sgc = group->sgc; - - capacity += sgc->capacity; - min_capacity = min(sgc->min_capacity, min_capacity); - max_capacity = max(sgc->max_capacity, max_capacity); + __maybe_unused cpumask_t *cpus = + sched_group_span(group); + + if (!cpu_isolated(cpumask_first(cpus))) { + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, + min_capacity); + max_capacity = max(sgc->max_capacity, + max_capacity); + } group = group->next; } while (group != child->groups); } @@ -8514,6 +8525,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); + if (cpu_isolated(i)) + continue; + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; @@ -8555,11 +8569,20 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + sgs->group_type = group_has_spare; + sgs->group_weight = group->group_weight; + return; + } + /* Check if dst CPU is idle and preferred to this group */ if (env->sd->flags & SD_ASYM_PACKING && - env->idle != CPU_NOT_IDLE && - sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + env->idle != CPU_NOT_IDLE && + sgs->sum_h_nr_running && + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { sgs->group_asym_packing = 1; } @@ -9617,6 +9640,17 @@ static int need_active_balance(struct lb_env *env) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } +#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_WALT) +int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + cpumask_t cpus; + + cpumask_and(&cpus, 
sched_group_span(sg), group_balance_mask(sg)); + cpumask_andnot(&cpus, &cpus, cpu_isolated_mask); + return cpumask_first(&cpus); +} +#endif /* CONFIG_SMP && CONFIG_SCHED_WALT */ + static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) @@ -9640,7 +9674,7 @@ static int should_we_balance(struct lb_env *env) /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { - if (!idle_cpu(cpu)) + if (!idle_cpu(cpu) || cpu_isolated(cpu)) continue; /* Are we the first idle CPU? */ @@ -9648,7 +9682,7 @@ static int should_we_balance(struct lb_env *env) } /* Are we the first CPU of this group ? */ - return group_balance_cpu(sg) == env->dst_cpu; + return group_balance_cpu_not_isolated(sg) == env->dst_cpu; } /* @@ -9850,7 +9884,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, * ->active_balance_work. Once set, it's cleared * only after active load balance is finished. */ - if (!busiest->active_balance) { + if (!busiest->active_balance && + !cpu_isolated(cpu_of(busiest))) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; @@ -10064,7 +10099,17 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + unsigned int available_cpus; +#ifdef CONFIG_SCHED_WALT + cpumask_t avail_mask; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); +#else + available_cpus = num_online_cpus(); +#endif + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -10194,6 +10239,10 @@ static inline int find_new_ilb(void) for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { +#ifdef CONFIG_SCHED_WALT + if (cpu_isolated(ilb)) + continue; +#endif if (idle_cpu(ilb)) return ilb; } @@ -10248,6 +10297,7 @@ static void nohz_balancer_kick(struct rq *rq) struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; unsigned int flags = 0; + cpumask_t cpumask; if (unlikely(rq->idle_balance)) return; @@ -10262,8 +10312,15 @@ static void nohz_balancer_kick(struct rq *rq) * None are in tickless mode and hence no need for NOHZ idle load * balancing. */ +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&cpumask, nohz.idle_cpus_mask, cpu_isolated_mask); + if (cpumask_empty(&cpumask)) + return; +#else + cpumask_copy(&cpumask, nohz.idle_cpus_mask); if (likely(!atomic_read(&nohz.nr_cpus))) return; +#endif if (READ_ONCE(nohz.has_blocked) && time_after(now, READ_ONCE(nohz.next_blocked))) @@ -10299,7 +10356,7 @@ static void nohz_balancer_kick(struct rq *rq) * currently idle; in which case, kick the ILB to move tasks * around. 
*/ - for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { + for_each_cpu_and(i, sched_domain_span(sd), &cpumask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; goto unlock; @@ -10477,6 +10534,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, int balance_cpu; int ret = false; struct rq *rq; + cpumask_t cpus; SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); @@ -10496,7 +10554,13 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, */ smp_mb(); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask); +#else + cpumask_copy(&cpus, nohz.idle_cpus_mask); +#endif + + for_each_cpu(balance_cpu, &cpus) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -10647,6 +10711,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we @@ -10760,6 +10827,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; + /* + * Since core isolation doesn't update nohz.idle_cpus_mask, there + * is a possibility this nohz kicked cpu could be isolated. Hence + * return if the cpu is isolated. + */ + if (cpu_isolated(this_rq->cpu)) + return; + /* * If this CPU has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle CPUs whose ticks are @@ -10781,8 +10856,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. + */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5938cf2e421b..6c1475950441 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -265,8 +265,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -2279,7 +2283,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. 
*/ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; rt_queue_pull_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 21aa5b081b96..503a7d147ac5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -141,6 +141,10 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); +#ifdef CONFIG_SMP +extern void init_sched_groups_capacity(int cpu, struct sched_domain *sd); +#endif + extern void call_trace_sched_update_nr_running(struct rq *rq, int count); /* * Helpers for converting nanosecond timing to jiffy resolution @@ -2897,6 +2901,11 @@ static inline int compute_load_scale_factor(struct sched_cluster *cluster) return load_scale; } +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} + static inline bool is_max_capacity_cpu(int cpu) { return cpu_max_possible_capacity(cpu) == max_possible_capacity; @@ -2959,6 +2968,9 @@ static inline unsigned long cpu_util_freq_walt(int cpu) return (util >= capacity) ? capacity : util; } #else /* CONFIG_SCHED_WALT */ + +static inline bool hmp_capable(void) { return false; } + static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta) { } @@ -2975,3 +2987,40 @@ static inline int is_reserved(int cpu) static inline void clear_reserved(int cpu) { } #endif /* CONFIG_SCHED_WALT */ + +struct sched_avg_stats { + int nr; + int nr_misfit; + int nr_max; + int nr_scaled; +}; +extern void sched_get_nr_running_avg(struct sched_avg_stats *stats); + +#ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +extern int group_balance_cpu_not_isolated(struct sched_group *sg); +#else +static inline int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + return group_balance_cpu(sg); +} +#endif /* CONFIG_SCHED_WALT */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_HOTPLUG_CPU +extern void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, + bool migrate_pinned_tasks); +#ifdef CONFIG_SCHED_WALT +extern void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks); +extern void attach_tasks_core(struct list_head *tasks, struct rq *rq); +#else +static inline void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks) +{ +} +static inline void attach_tasks_core(struct list_head *tasks, struct rq *rq) {} +#endif +#endif diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c new file mode 100644 index 000000000000..be968752bd8b --- /dev/null +++ b/kernel/sched/sched_avg.c @@ -0,0 +1,194 @@ +/* Copyright (c) 2012, 2015-2018, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +/* + * Scheduler hook for average runqueue determination + */ +#include +#include +#include +#include +#include + +#include "sched.h" +#include "walt.h" +#include + +static DEFINE_PER_CPU(u64, nr_prod_sum); +static DEFINE_PER_CPU(u64, last_time); +static DEFINE_PER_CPU(u64, nr_big_prod_sum); +static DEFINE_PER_CPU(u64, nr); +static DEFINE_PER_CPU(u64, nr_max); + +static DEFINE_PER_CPU(unsigned long, iowait_prod_sum); +static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); +static s64 last_get_time; + +static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0); + +#define NR_THRESHOLD_PCT 15 + +/** + * sched_get_nr_running_avg + * @return: Average nr_running, iowait and nr_big_tasks value since last poll. + * Returns the avg * 100 to return up to two decimal points + * of accuracy. + * + * Obtains the average nr_running value since the last poll. + * This function may not be called concurrently with itself + */ +void sched_get_nr_running_avg(struct sched_avg_stats *stats) +{ + int cpu; + u64 curr_time = sched_clock(); + u64 period = curr_time - last_get_time; + u64 tmp_nr, tmp_misfit; + + if (!period) + return; + + /* read and reset nr_running counts */ + for_each_possible_cpu(cpu) { + unsigned long flags; + u64 diff; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + + tmp_nr = per_cpu(nr_prod_sum, cpu); + tmp_nr += per_cpu(nr, cpu) * diff; + tmp_nr = div64_u64((tmp_nr * 100), period); + + tmp_misfit = per_cpu(nr_big_prod_sum, cpu); + tmp_misfit = div64_u64((tmp_misfit * 100), period); + + /* + * NR_THRESHOLD_PCT is to make sure that the task ran + * at least 85% in the last window to compensate any + * over estimating being done. + */ + stats[cpu].nr = (int)div64_u64((tmp_nr + NR_THRESHOLD_PCT), + 100); + stats[cpu].nr_misfit = (int)div64_u64((tmp_misfit + + NR_THRESHOLD_PCT), 100); + stats[cpu].nr_max = per_cpu(nr_max, cpu); + + trace_sched_get_nr_running_avg(cpu, stats[cpu].nr, + stats[cpu].nr_misfit, stats[cpu].nr_max); + + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr_prod_sum, cpu) = 0; + per_cpu(nr_big_prod_sum, cpu) = 0; + per_cpu(iowait_prod_sum, cpu) = 0; + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); + } + + last_get_time = curr_time; + +} +EXPORT_SYMBOL(sched_get_nr_running_avg); + +#define BUSY_NR_RUN 3 +#define BUSY_LOAD_FACTOR 10 +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ + bool nr_run_trigger = false, load_trigger = false; + + if (!hmp_capable() || is_min_capacity_cpu(cpu)) + return; + + if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN) + nr_run_trigger = true; + + if (dequeue && (cpu_util(cpu) * BUSY_LOAD_FACTOR) > + capacity_orig_of(cpu)) + load_trigger = true; + + if (nr_run_trigger || load_trigger) + atomic64_set(&per_cpu(last_busy_time, cpu), curr_time); +} + +/** + * sched_update_nr_prod + * @cpu: The core id of the nr running driver. 
+ * @delta: Adjust nr by 'delta' amount + * @inc: Whether we are increasing or decreasing the count + * @return: N/A + * + * Update average with latest nr_running value for CPU + */ +void sched_update_nr_prod(int cpu, long delta, bool inc) +{ + u64 diff; + u64 curr_time; + unsigned long flags, nr_running; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + nr_running = per_cpu(nr, cpu); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta); + + BUG_ON((s64)per_cpu(nr, cpu) < 0); + + if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + update_last_busy_time(cpu, !inc, nr_running, curr_time); + + per_cpu(nr_prod_sum, cpu) += nr_running * diff; + per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); +} +EXPORT_SYMBOL(sched_update_nr_prod); + +/* + * Returns the CPU utilization % in the last window. + * + */ +unsigned int sched_get_cpu_util(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 util; + unsigned long capacity, flags; + unsigned int busy; + + raw_spin_lock_irqsave(&rq->lock, flags); + + util = rq->cfs.avg.util_avg; + capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = rq->prev_runnable_sum; + util = div64_u64(util, + sched_ravg_window >> SCHED_CAPACITY_SHIFT); + } +#endif + raw_spin_unlock_irqrestore(&rq->lock, flags); + + util = (util >= capacity) ? capacity : util; + busy = div64_ul((util * 100), capacity); + return busy; +} + +u64 sched_get_cpu_last_busy_time(int cpu) +{ + return atomic64_read(&per_cpu(last_busy_time, cpu)); +} diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 004e9505f7ad..24f9f092a574 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1220,16 +1220,25 @@ build_sched_groups(struct sched_domain *sd, int cpu) * group having more cpu_capacity will pickup more load compared to the * group having less cpu_capacity. */ -static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) +void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; +#ifdef CONFIG_SCHED_WALT + cpumask_t avail_mask; +#endif WARN_ON(!sg); do { int cpu, max_cpu = -1; +#ifdef CONFIG_SCHED_WALT + cpumask_andnot(&avail_mask, sched_group_span(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); +#else sg->group_weight = cpumask_weight(sched_group_span(sg)); +#endif if (!(sd->flags & SD_ASYM_PACKING)) goto next; diff --git a/kernel/smp.c b/kernel/smp.c index f73a597c8e4c..92742aa1e348 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -957,7 +957,8 @@ void wake_up_all_idle_cpus(void) if (cpu == smp_processor_id()) continue; - wake_up_if_idle(cpu); + if (!cpu_isolated(cpu)) + wake_up_if_idle(cpu); } preempt_enable(); } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 890b79cf0e7c..51c1fe80a9f6 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -442,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask, * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. 
*/ -static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { int ret; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4ef90718c114..30abe5f4ce4e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1004,12 +1004,9 @@ static void __remove_hrtimer(struct hrtimer *timer, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, newstate); - if (!(state & HRTIMER_STATE_ENQUEUED)) - return; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -1024,19 +1021,25 @@ static void __remove_hrtimer(struct hrtimer *timer, */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); + +out: + /* + * We need to preserve PINNED state here, otherwise we may end up + * migrating pinned hrtimers as well. + */ + WRITE_ONCE(timer->state, newstate | (timer->state & HRTIMER_STATE_PINNED)); } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - bool restart, bool keep_local) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { - bool reprogram; + int reprogram; /* * Remove the timer and force reprogramming when high @@ -1049,18 +1052,11 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * If the timer is not restarted then reprogramming is - * required if the timer is local. If it is local and about - * to be restarted, avoid programming it twice (on removal - * and a moment later when it's requeued). - */ if (!restart) state = HRTIMER_STATE_INACTIVE; - else - reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); + timer->state &= ~HRTIMER_STATE_PINNED; return 1; } return 0; @@ -1112,31 +1108,9 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; - bool force_local, first; - /* - * If the timer is on the local cpu base and is the first expiring - * timer then this might end up reprogramming the hardware twice - * (on removal and on enqueue). To avoid that by prevent the - * reprogram on removal, keep the timer local to the current CPU - * and enforce reprogramming after it is queued no matter whether - * it is the new first expiring timer again or not. - */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - force_local &= base->cpu_base->next_timer == timer; - - /* - * Remove an active timer from the queue. In case it is not queued - * on the current CPU, make sure that remove_hrtimer() updates the - * remote data correctly. - * - * If it's on the current CPU and the first expiring timer, then - * skip reprogramming, keep the timer local and enforce - * reprogramming later if it was the first expiring timer. This - * avoids programming the underlying clock event twice (once at - * removal and once after enqueue). 
- */ - remove_hrtimer(timer, base, true, force_local); + /* Remove an active timer from the queue: */ + remove_hrtimer(timer, base, true); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); @@ -1146,24 +1120,13 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ - if (!force_local) { - new_base = switch_hrtimer_base(timer, base, - mode & HRTIMER_MODE_PINNED); - } else { - new_base = base; - } + new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + /* Update pinned state */ + timer->state &= ~HRTIMER_STATE_PINNED; + timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT; - /* - * Timer was forced to stay on the current CPU to avoid - * reprogramming on removal and enqueue. Force reprogram the - * hardware by evaluating the new first expiring timer. - */ - hrtimer_force_reprogram(new_base->cpu_base, 1); - return 0; + return enqueue_hrtimer(timer, new_base, mode); } /** @@ -1229,7 +1192,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false, false); + ret = remove_hrtimer(timer, base, false); unlock_hrtimer_base(timer, &flags); @@ -2083,14 +2046,21 @@ int hrtimers_prepare_cpu(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, + bool remove_pinned) { struct hrtimer *timer; struct timerqueue_node *node; + struct timerqueue_head pinned; + int is_pinned; + bool is_hotplug = !cpu_online(old_base->cpu_base->cpu); + + timerqueue_init_head(&pinned); while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); - BUG_ON(hrtimer_callback_running(timer)); + if (is_hotplug) + BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* @@ -2099,6 +2069,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + + is_pinned = timer->state & HRTIMER_STATE_PINNED; + if (!remove_pinned && is_pinned) { + timerqueue_add(&pinned, &timer->node); + continue; + } + timer->base = new_base; /* * Enqueue the timers on the new cpu. 
This does not @@ -2110,23 +2087,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } + + /* Re-queue pinned timers for non-hotplug usecase */ + while ((node = timerqueue_getnext(&pinned))) { + timer = container_of(node, struct hrtimer, node); + + timerqueue_del(&pinned, &timer->node); + enqueue_hrtimer(timer, old_base, HRTIMER_MODE_ABS); + } } -int hrtimers_dead_cpu(unsigned int scpu) +static void __migrate_hrtimers(unsigned int scpu, bool remove_pinned) { struct hrtimer_cpu_base *old_base, *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); - - /* - * this BH disable ensures that raise_softirq_irqoff() does - * not wakeup ksoftirqd (and acquire the pi-lock) while - * holding the cpu_base lock - */ - local_bh_disable(); - local_irq_disable(); + local_irq_save(flags); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); /* @@ -2138,7 +2115,7 @@ int hrtimers_dead_cpu(unsigned int scpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + &new_base->clock_base[i], remove_pinned); } /* @@ -2152,11 +2129,30 @@ int hrtimers_dead_cpu(unsigned int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); +} + +int hrtimers_dead_cpu(unsigned int scpu) +{ + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + /* + * this BH disable ensures that raise_softirq_irqoff() does + * not wakeup ksoftirqd (and acquire the pi-lock) while + * holding the cpu_base lock + */ + local_bh_disable(); + __migrate_hrtimers(scpu, true); local_bh_enable(); return 0; } +void hrtimer_quiesce_cpu(void *cpup) +{ + __migrate_hrtimers(*(int *)cpup, false); +} + #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a3ec21be3b14..b09a20c2502d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1921,14 +1921,20 @@ signed long __sched schedule_timeout_idle(signed long timeout) EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) +static void migrate_timer_list(struct timer_base *new_base, + struct hlist_head *head, bool remove_pinned) { struct timer_list *timer; int cpu = new_base->cpu; + struct hlist_node *n; + int is_pinned; - while (!hlist_empty(head)) { - timer = hlist_entry(head->first, struct timer_list, entry); - detach_timer(timer, false); + hlist_for_each_entry_safe(timer, n, head, entry) { + is_pinned = timer->flags & TIMER_PINNED; + if (!remove_pinned && is_pinned) + continue; + + detach_if_pending(timer, get_timer_base(timer->flags), false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } @@ -1949,14 +1955,13 @@ int timers_prepare_cpu(unsigned int cpu) return 0; } -int timers_dead_cpu(unsigned int cpu) +static void __migrate_timers(unsigned int cpu, bool remove_pinned) { struct timer_base *old_base; struct timer_base *new_base; + unsigned long flags; int b, i; - BUG_ON(cpu_online(cpu)); - for (b = 0; b < NR_BASES; b++) { old_base = per_cpu_ptr(&timer_bases[b], cpu); new_base = get_cpu_ptr(&timer_bases[b]); @@ -1964,7 +1969,7 @@ int timers_dead_cpu(unsigned int cpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - raw_spin_lock_irq(&new_base->lock); + raw_spin_lock_irqsave(&new_base->lock, flags); raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); /* @@ -1973,18 +1978,31 @@ int timers_dead_cpu(unsigned int cpu) */ forward_timer_base(new_base); - BUG_ON(old_base->running_timer); + if (!cpu_online(cpu)) + BUG_ON(old_base->running_timer); for (i = 0; i < WHEEL_SIZE; i++) - migrate_timer_list(new_base, old_base->vectors + i); + migrate_timer_list(new_base, old_base->vectors + i, + remove_pinned); raw_spin_unlock(&old_base->lock); - raw_spin_unlock_irq(&new_base->lock); + raw_spin_unlock_irqrestore(&new_base->lock, flags); put_cpu_ptr(&timer_bases); } +} + +int timers_dead_cpu(unsigned int cpu) +{ + BUG_ON(cpu_online(cpu)); + __migrate_timers(cpu, true); return 0; } +void timer_quiesce_cpu(void *cpup) +{ + __migrate_timers(*(unsigned int *)cpup, false); +} + #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 01bf977090dc..e0a05a779f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -171,7 +172,7 @@ static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); -static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(unsigned int, watchdog_en);static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -428,16 +429,20 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void watchdog_enable(unsigned int cpu) +void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); struct completion *done = this_cpu_ptr(&softlockup_completion); + unsigned int *enabled = this_cpu_ptr(&watchdog_en); WARN_ON_ONCE(cpu != smp_processor_id()); init_completion(done); complete(done); + if (*enabled) + return; + /* * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. @@ -452,11 +457,24 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); + + /* + * Need to ensure above operations are observed by other CPUs before + * indicating that timer is enabled. This is to synchronize core + * isolation and hotplug. Core isolation will wait for this flag to be + * set. 
+ */ + mb(); + *enabled = 1; } -static void watchdog_disable(unsigned int cpu) +void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = per_cpu_ptr(&watchdog_en, cpu); + + if (!*enabled) + return; WARN_ON_ONCE(cpu != smp_processor_id()); @@ -468,6 +486,17 @@ static void watchdog_disable(unsigned int cpu) watchdog_nmi_disable(cpu); hrtimer_cancel(hrtimer); wait_for_completion(this_cpu_ptr(&softlockup_completion)); + + /* + * No need for barrier here since disabling the watchdog is + * synchronized with hotplug lock + */ + *enabled = 0; +} + +bool watchdog_configured(unsigned int cpu) +{ + return *per_cpu_ptr(&watchdog_en, cpu); } static int softlockup_stop_fn(void *data) @@ -482,7 +511,6 @@ static void softlockup_stop_all(void) if (!softlockup_initialized) return; - for_each_cpu(cpu, &watchdog_allowed_mask) smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); diff --git a/mm/vmstat.c b/mm/vmstat.c index 698bc0bc18d1..15560b6b7704 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1863,7 +1863,7 @@ int vmstat_refresh(struct ctl_table *table, int write, static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats(true)) { + if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1955,7 +1955,8 @@ static void vmstat_shepherd(struct work_struct *w) for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); - if (!delayed_work_pending(dw) && need_update(cpu)) + if (!delayed_work_pending(dw) && need_update(cpu) && + !cpu_isolated(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } put_online_cpus(); -- Gitee
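
For reference, the time-weighted nr_running statistic that kernel/sched/sched_avg.c keeps per CPU (accumulate nr_running * elapsed time on every enqueue/dequeue, then divide by the poll period) can be illustrated with a small standalone sketch. This is not part of the patch and uses simplified, hypothetical names (cpu_stat, nr_update, poll_average) rather than the kernel's per-CPU variables and locking:

/*
 * Userspace sketch of the windowed nr_running average used by this
 * patch.  nr_update() mirrors sched_update_nr_prod(): it charges the
 * old nr_running value for the time it was in effect before applying
 * the delta.  poll_average() mirrors sched_get_nr_running_avg(): it
 * closes the window and returns the average scaled by 100.
 */
#include <stdio.h>
#include <stdint.h>

struct cpu_stat {
	uint64_t nr_prod_sum;	/* sum of nr_running * time it was held */
	uint64_t last_time;	/* timestamp of the last nr_running change */
	uint64_t nr;		/* current nr_running */
};

static void nr_update(struct cpu_stat *s, uint64_t now, long delta)
{
	s->nr_prod_sum += s->nr * (now - s->last_time);
	s->last_time = now;
	s->nr += delta;
}

static uint64_t poll_average(struct cpu_stat *s, uint64_t now,
			     uint64_t window_start)
{
	uint64_t period = now - window_start;
	uint64_t sum = s->nr_prod_sum + s->nr * (now - s->last_time);

	if (!period)
		return 0;

	/* reset for the next window, as the poll path does */
	s->nr_prod_sum = 0;
	s->last_time = now;

	return sum * 100 / period;	/* two decimal points of accuracy */
}

int main(void)
{
	struct cpu_stat s = { .last_time = 0, .nr = 1 };

	nr_update(&s, 40, +2);	/* 1 task for 40 units, then 3 */
	nr_update(&s, 70, -1);	/* 3 tasks for 30 units, then 2 */

	/* (1*40 + 3*30 + 2*30) / 100 = 1.9 -> prints 190 */
	printf("avg*100 = %llu\n",
	       (unsigned long long)poll_average(&s, 100, 0));
	return 0;
}

The kernel version additionally rounds up by NR_THRESHOLD_PCT, tracks the per-window maximum and misfit counts, and takes a per-CPU spinlock around the update, but the averaging arithmetic is the same as in the sketch above.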