diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index c7d2510e5a786fec9f77852eb4a285117dcedaad..853c4f81ba4a5761da3135090ece7e9558f581a2 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -13,6 +13,7 @@
 #define arch_set_freq_scale topology_set_freq_scale
 #define arch_scale_freq_capacity topology_get_freq_scale
 #define arch_scale_freq_invariant topology_scale_freq_invariant
+#define arch_scale_freq_ref topology_get_freq_ref
 #endif

 /* Replace task scheduler's default cpu-invariant accounting */
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 9fab663dd2de96f350e9a65bde3a6fa8d29586f2..a323b109b9c44ba5fab1f2cfe2359dd6694c3a5a 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -23,6 +23,7 @@ void update_freq_counters_refs(void);
 #define arch_set_freq_scale topology_set_freq_scale
 #define arch_scale_freq_capacity topology_get_freq_scale
 #define arch_scale_freq_invariant topology_scale_freq_invariant
+#define arch_scale_freq_ref topology_get_freq_ref

 #ifdef CONFIG_ACPI_CPPC_LIB
 #define arch_init_invariance_cppc topology_init_cpu_capacity_cppc
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 3b9ea46afb5285cb51a2847b54ce2ed4a4df9236..d9a6370c05c597596e65039bb421d61adeb31125 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -14,7 +14,6 @@
 #include <linux/acpi.h>
 #include <linux/arch_topology.h>
 #include <linux/cacheinfo.h>
-#include <linux/cpufreq.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
 #include <asm/cpu.h>
@@ -136,7 +135,12 @@ int __init parse_acpi_topology(void)
 #undef pr_fmt
 #define pr_fmt(fmt) "AMU: " fmt

-static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale);
+/*
+ * Ensure that amu_scale_freq_tick() will return SCHED_CAPACITY_SCALE until
+ * the CPU capacity and its associated frequency have been correctly
+ * initialized.
+ */
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT);
 static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
 static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
 static cpumask_var_t amu_fie_cpus;
@@ -166,14 +170,14 @@ static inline bool freq_counters_valid(int cpu)
 	return true;
 }

-static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate)
+void freq_inv_set_max_ratio(int cpu, u64 max_rate)
 {
-	u64 ratio;
+	u64 ratio, ref_rate = arch_timer_get_rate();

 	if (unlikely(!max_rate || !ref_rate)) {
-		pr_debug("CPU%d: invalid maximum or reference frequency.\n",
+		WARN_ONCE(1, "CPU%d: invalid maximum or reference frequency.\n",
 			 cpu);
-		return -EINVAL;
+		return;
 	}

 	/*
@@ -193,12 +197,10 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate)
 	ratio = div64_u64(ratio, max_rate);
 	if (!ratio) {
 		WARN_ONCE(1, "Reference frequency too low.\n");
-		return -EINVAL;
+		return;
 	}

-	per_cpu(arch_max_freq_scale, cpu) = (unsigned long)ratio;
-
-	return 0;
+	WRITE_ONCE(per_cpu(arch_max_freq_scale, cpu), (unsigned long)ratio);
 }

 static void amu_scale_freq_tick(void)
@@ -240,55 +242,28 @@ static struct scale_freq_data amu_sfd = {
 	.set_freq_scale = amu_scale_freq_tick,
 };

-static void amu_fie_setup(const struct cpumask *cpus)
+static void amu_fie_setup(unsigned int cpu)
 {
-	int cpu;
-
-	/* We are already set since the last insmod of cpufreq driver */
-	if (unlikely(cpumask_subset(cpus, amu_fie_cpus)))
+	if (cpumask_test_cpu(cpu, amu_fie_cpus))
 		return;

-	for_each_cpu(cpu, cpus) {
-		if (!freq_counters_valid(cpu) ||
-		    freq_inv_set_max_ratio(cpu,
-					   cpufreq_get_hw_max_freq(cpu) * 1000ULL,
-					   arch_timer_get_rate()))
-			return;
-	}
+	if (!freq_counters_valid(cpu))
+		return;

-	cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus);
+	cpumask_set_cpu(cpu, amu_fie_cpus);

 	topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus);

-	pr_debug("CPUs[%*pbl]: counters will be used for FIE.",
-		 cpumask_pr_args(cpus));
+	pr_debug("CPUs[%u]: counters will be used for FIE.", cpu);
 }

-static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val,
-				 void *data)
+static int cpuhp_topology_online(unsigned int cpu)
 {
-	struct cpufreq_policy *policy = data;
-
-	if (val == CPUFREQ_CREATE_POLICY)
-		amu_fie_setup(policy->related_cpus);
-
-	/*
-	 * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU
-	 * counters don't have any dependency on cpufreq driver once we have
-	 * initialized AMU support and enabled invariance. The AMU counters will
-	 * keep on working just fine in the absence of the cpufreq driver, and
-	 * for the CPUs for which there are no counters available, the last set
-	 * value of arch_freq_scale will remain valid as that is the frequency
-	 * those CPUs are running at.
-	 */
+	amu_fie_setup(cpu);
 	return 0;
 }

-static struct notifier_block init_amu_fie_notifier = {
-	.notifier_call = init_amu_fie_callback,
-};
-
 static int __init init_amu_fie(void)
 {
 	int ret;
@@ -296,12 +271,16 @@ static int __init init_amu_fie(void)
 	if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL))
 		return -ENOMEM;

-	ret = cpufreq_register_notifier(&init_amu_fie_notifier,
-					CPUFREQ_POLICY_NOTIFIER);
-	if (ret)
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				"arm64/topology:online",
+				cpuhp_topology_online,
+				NULL);
+	if (ret < 0) {
 		free_cpumask_var(amu_fie_cpus);
+		return ret;
+	}

-	return ret;
+	return 0;
 }
 core_initcall(init_amu_fie);
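A note on the fixed-point math above (an illustration, not part of the patch): freq_inv_set_max_ratio() stores ref_rate/max_rate pre-shifted left by 2 * SCHED_CAPACITY_SHIFT, so that amu_scale_freq_tick() can turn the per-tick deltas of the core and constant cycle counters into a frequency scale with one multiply, one shift and one divide. A minimal user-space sketch, assuming SCHED_CAPACITY_SHIFT = 10 and hypothetical rates (25 MHz constant counter, 2.0 GHz maximum CPU frequency):

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

/* As freq_inv_set_max_ratio(): ratio = (ref_rate << 2*SHIFT) / max_rate */
static uint64_t max_ratio(uint64_t max_rate, uint64_t ref_rate)
{
	return (ref_rate << (2 * SCHED_CAPACITY_SHIFT)) / max_rate;
}

/* As amu_scale_freq_tick(): scale = (dcore * ratio >> SHIFT) / dconst */
static uint64_t freq_scale(uint64_t dcore, uint64_t dconst, uint64_t ratio)
{
	return ((dcore * ratio) >> SCHED_CAPACITY_SHIFT) / dconst;
}

int main(void)
{
	/* Hypothetical 25 MHz constant counter, 2.0 GHz maximum frequency. */
	uint64_t ratio = max_ratio(2000000000ULL, 25000000ULL);

	/* Counter deltas over a 1 ms tick, at fmax and at fmax/2. */
	printf("at fmax:   %llu\n",
	       (unsigned long long)freq_scale(2000000, 25000, ratio));
	printf("at fmax/2: %llu\n",
	       (unsigned long long)freq_scale(1000000, 25000, ratio));
	return 0;
}

At the maximum frequency this converges to SCHED_CAPACITY_SCALE (1024, minus rounding) and scales linearly below it. It also shows why the new static initializer of 1UL << (2 * SCHED_CAPACITY_SHIFT) is safe: it corresponds to ref_rate == max_rate, i.e. a neutral ratio, until the real one is written.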
diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h
index e316ab3b77f341ced3d4e3adf3c2e492a1feaaf0..61183688bdd54edf247a94d5c77258cd96d248b5 100644
--- a/arch/riscv/include/asm/topology.h
+++ b/arch/riscv/include/asm/topology.h
@@ -9,6 +9,7 @@
 #define arch_set_freq_scale topology_set_freq_scale
 #define arch_scale_freq_capacity topology_get_freq_scale
 #define arch_scale_freq_invariant topology_scale_freq_invariant
+#define arch_scale_freq_ref topology_get_freq_ref

 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity topology_get_cpu_scale
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index b776ba4127db717f1434b109508a7813f9620120..24482158c1e628721a1f7d1f5a3b4e4d223a41dd 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/units.h>

 #define CREATE_TRACE_POINTS
 #include <trace/events/thermal_pressure.h>
@@ -28,7 +29,8 @@
 static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
 static struct cpumask scale_freq_counters_mask;
 static bool scale_freq_invariant;
-static DEFINE_PER_CPU(u32, freq_factor) = 1;
+DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1;
+EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref);

 static bool supports_scale_freq_counters(const struct cpumask *cpus)
 {
@@ -172,9 +174,9 @@ DEFINE_PER_CPU(unsigned long, thermal_pressure);
  * operating on stale data when hot-plug is used for some CPUs. The
  * @capped_freq reflects the currently allowed max CPUs frequency due to
  * thermal capping. It might be also a boost frequency value, which is bigger
- * than the internal 'freq_factor' max frequency. In such case the pressure
- * value should simply be removed, since this is an indication that there is
- * no thermal throttling. The @capped_freq must be provided in kHz.
+ * than the internal 'capacity_freq_ref' max frequency. In such a case the
+ * pressure value should simply be removed, since this is an indication that
+ * there is no thermal throttling. The @capped_freq must be provided in kHz.
  */
 void topology_update_thermal_pressure(const struct cpumask *cpus,
 				      unsigned long capped_freq)
@@ -185,10 +187,7 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,

 	cpu = cpumask_first(cpus);
 	max_capacity = arch_scale_cpu_capacity(cpu);
-	max_freq = per_cpu(freq_factor, cpu);
-
-	/* Convert to MHz scale which is used in 'freq_factor' */
-	capped_freq /= 1000;
+	max_freq = arch_scale_freq_ref(cpu);

 	/*
 	 * Handle properly the boost frequencies, which should simply clean
@@ -300,13 +299,13 @@ void topology_normalize_cpu_scale(void)

 	capacity_scale = 1;
 	for_each_possible_cpu(cpu) {
-		capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
+		capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
 		capacity_scale = max(capacity, capacity_scale);
 	}

 	pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale);
 	for_each_possible_cpu(cpu) {
-		capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
+		capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
 		capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
 				     capacity_scale);
 		topology_set_cpu_scale(cpu, capacity);
@@ -342,15 +341,15 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
 			cpu_node, raw_capacity[cpu]);

 		/*
-		 * Update freq_factor for calculating early boot cpu capacities.
+		 * Update capacity_freq_ref for calculating early boot CPU capacities.
 		 * For non-clk CPU DVFS mechanism, there's no way to get the
 		 * frequency value now, assuming they are running at the same
-		 * frequency (by keeping the initial freq_factor value).
+		 * frequency (by keeping the initial capacity_freq_ref value).
 		 */
 		cpu_clk = of_clk_get(cpu_node, 0);
 		if (!PTR_ERR_OR_ZERO(cpu_clk)) {
-			per_cpu(freq_factor, cpu) =
-				clk_get_rate(cpu_clk) / 1000;
+			per_cpu(capacity_freq_ref, cpu) =
+				clk_get_rate(cpu_clk) / HZ_PER_KHZ;
 			clk_put(cpu_clk);
 		}
 	} else {
@@ -366,11 +365,16 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
 	return !ret;
 }

+void __weak freq_inv_set_max_ratio(int cpu, u64 max_rate)
+{
+}
+
 #ifdef CONFIG_ACPI_CPPC_LIB
 #include <acpi/cppc_acpi.h>

 void topology_init_cpu_capacity_cppc(void)
 {
+	u64 capacity, capacity_scale = 0;
 	struct cppc_perf_caps perf_caps;
 	int cpu;
@@ -387,6 +391,10 @@ void topology_init_cpu_capacity_cppc(void)
 		    (perf_caps.highest_perf >= perf_caps.nominal_perf) &&
 		    (perf_caps.highest_perf >= perf_caps.lowest_perf)) {
 			raw_capacity[cpu] = perf_caps.highest_perf;
+			capacity_scale = max_t(u64, capacity_scale, raw_capacity[cpu]);
+
+			per_cpu(capacity_freq_ref, cpu) = cppc_perf_to_khz(&perf_caps, raw_capacity[cpu]);
+
 			pr_debug("cpu_capacity: CPU%d cpu_capacity=%u (raw).\n",
 				 cpu, raw_capacity[cpu]);
 			continue;
@@ -397,7 +405,18 @@ void topology_init_cpu_capacity_cppc(void)
 		goto exit;
 	}

-	topology_normalize_cpu_scale();
+	for_each_possible_cpu(cpu) {
+		freq_inv_set_max_ratio(cpu,
+				       per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ);
+
+		capacity = raw_capacity[cpu];
+		capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
+				     capacity_scale);
+		topology_set_cpu_scale(cpu, capacity);
+		pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
+			 cpu, topology_get_cpu_scale(cpu));
+	}
+
 	schedule_work(&update_topology_flags_work);
 	pr_debug("cpu_capacity: cpu_capacity initialization done\n");
@@ -431,8 +450,11 @@ init_cpu_capacity_callback(struct notifier_block *nb,

 	cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);

-	for_each_cpu(cpu, policy->related_cpus)
-		per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000;
+	for_each_cpu(cpu, policy->related_cpus) {
+		per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq;
+		freq_inv_set_max_ratio(cpu,
+				       per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ);
+	}

 	if (cpumask_empty(cpus_to_visit)) {
 		topology_normalize_cpu_scale();
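To make the normalization above concrete (an illustrative sketch with made-up numbers, not part of the patch): with capacity_freq_ref now held in kHz, both topology_normalize_cpu_scale() and the new CPPC path weight each CPU's raw capacity by its reference frequency and scale the largest product down to SCHED_CAPACITY_SCALE:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

struct cpu {
	uint64_t raw_capacity;      /* DT capacity-dmips-mhz or CPPC perf */
	uint64_t capacity_freq_ref; /* reference (max) frequency, kHz */
};

int main(void)
{
	/* Hypothetical big.LITTLE pair: 1.8 GHz little, 2.8 GHz big. */
	struct cpu cpus[] = { { 446, 1800000 }, { 1024, 2800000 } };
	uint64_t capacity, capacity_scale = 1;
	int i;

	/* First pass: find the largest raw-capacity * frequency product. */
	for (i = 0; i < 2; i++) {
		capacity = cpus[i].raw_capacity * cpus[i].capacity_freq_ref;
		if (capacity > capacity_scale)
			capacity_scale = capacity;
	}

	/* Second pass: normalize every CPU against that maximum. */
	for (i = 0; i < 2; i++) {
		capacity = cpus[i].raw_capacity * cpus[i].capacity_freq_ref;
		capacity = (capacity << SCHED_CAPACITY_SHIFT) / capacity_scale;
		/* Prints 286 for the little CPU, 1024 for the big one. */
		printf("CPU%d: cpu_capacity=%llu\n", i,
		       (unsigned long long)capacity);
	}
	return 0;
}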
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 986d0fcce01f8e8022e094cc9deacc74105cdf12..c317a8841f6c7bf4804c0f478794bcfe6488f16d 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -113,12 +113,23 @@ static void cppc_scale_freq_workfn(struct kthread_work *work)
 	struct cppc_cpudata *cpu_data;
 	unsigned long local_freq_scale;
 	u64 perf;
+	int ret;

 	cppc_fi = container_of(work, struct cppc_freq_invariance, work);
 	cpu_data = cppc_fi->cpu_data;

-	if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) {
+	ret = cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs);
+	/*
+	 * Perf counters could be 0 if the CPU is in a low-power idle state.
+	 * Just try again next time.
+	 */
+	if (ret == -EFAULT)
+		return;
+
+	if (ret) {
 		pr_warn("%s: failed to read perf counters\n", __func__);
+		topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC,
+						 cpu_data->shared_cpu_map);
 		return;
 	}
@@ -176,16 +187,14 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy)
 		init_irq_work(&cppc_fi->irq_work, cppc_irq_work);

 		ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs);
-		if (ret) {
-			pr_warn("%s: failed to read perf counters for cpu:%d: %d\n",
-				__func__, cpu, ret);
-
+		if (ret && cpu_online(cpu)) {
 			/*
 			 * Don't abort if the CPU was offline while the driver
 			 * was getting registered.
 			 */
-			if (cpu_online(cpu))
-				return;
+			pr_debug("%s: failed to read perf counters for cpu:%d: %d\n",
+				 __func__, cpu, ret);
+			return;
 		}
 	}
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index f02326e1b2f29aba873ee0263c26aa7ff88b9087..bbac0892651e940333f3ee3692837896cb18d2ad 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -454,7 +454,7 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy,

 	arch_set_freq_scale(policy->related_cpus,
 			    policy->cur,
-			    policy->cpuinfo.max_freq);
+			    arch_scale_freq_ref(policy->cpu));

 	spin_lock(&policy->transition_lock);
 	policy->transition_ongoing = false;
@@ -2201,7 +2201,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,

 	policy->cur = freq;
 	arch_set_freq_scale(policy->related_cpus, freq,
-			    policy->cpuinfo.max_freq);
+			    arch_scale_freq_ref(policy->cpu));

 	cpufreq_stats_record_transition(policy, freq);
 	if (trace_cpu_frequency_enabled()) {
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index a07b510e7dc55960e83dca5542f9e937925c66d9..a63d61ca55afc82ec865de9de60d8fe667964fef 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -27,6 +27,13 @@ static inline unsigned long topology_get_cpu_scale(int cpu)

 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);

+DECLARE_PER_CPU(unsigned long, capacity_freq_ref);
+
+static inline unsigned long topology_get_freq_ref(int cpu)
+{
+	return per_cpu(capacity_freq_ref, cpu);
+}
+
 DECLARE_PER_CPU(unsigned long, arch_freq_scale);

 static inline unsigned long topology_get_freq_scale(int cpu)
@@ -92,6 +99,7 @@ void update_siblings_masks(unsigned int cpu);
 void remove_cpu_topology(unsigned int cpuid);
 void reset_cpu_topology(void);
 int parse_acpi_topology(void);
+void freq_inv_set_max_ratio(int cpu, u64 max_rate);
 #endif

 #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
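The two cpufreq call sites above only change the reference passed to arch_set_freq_scale(); the resulting scale is still cur/ref expressed in SCHED_CAPACITY_SCALE units, as computed by topology_set_freq_scale(). A sketch of that arithmetic, assuming a fixed 2.8 GHz reference (illustrative only):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

/* Mirrors the scale computation done by topology_set_freq_scale(). */
static unsigned long long freq_scale(unsigned long long cur_khz,
				     unsigned long long ref_khz)
{
	return (cur_khz << SCHED_CAPACITY_SHIFT) / ref_khz;
}

int main(void)
{
	printf("%llu\n", freq_scale(1400000, 2800000)); /* 512  */
	printf("%llu\n", freq_scale(2800000, 2800000)); /* 1024 */
	return 0;
}

Using arch_scale_freq_ref() instead of policy->cpuinfo.max_freq keeps this reference coherent with the one used to compute the CPU capacity, even if cpuinfo.max_freq changes later (e.g. when boost is toggled).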
index 9e1fb9a5480bb3cc192d71787645ee786fbf2166..9be837765a9924fbe4a15e76a7747596988e05a8 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1231,6 +1231,7 @@ void arch_set_freq_scale(const struct cpumask *cpus,
 {
 }
 #endif
+
 /* the following are really really optional */
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
 extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 0091bf129f772d41341b8f4c26cf7163c818eb3d..784efdba67037adc651d63d2b65b8ae08921b20e 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -229,7 +229,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 				unsigned long max_util, unsigned long sum_util,
 				unsigned long allowed_cpu_cap)
 {
-	unsigned long freq, scale_cpu;
+	unsigned long freq, ref_freq, scale_cpu;
 	struct em_perf_state *ps;
 	int cpu;

@@ -246,11 +246,11 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 */
 	cpu = cpumask_first(to_cpumask(pd->cpus));
 	scale_cpu = arch_scale_cpu_capacity(cpu);
-	ps = &pd->table[pd->nr_perf_states - 1];
+	ref_freq = arch_scale_freq_ref(cpu);

 	max_util = map_util_perf(max_util);
 	max_util = min(max_util, allowed_cpu_cap);
-	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
+	freq = map_util_freq(max_util, ref_freq, scale_cpu);

 	/*
 	 * Find the lowest performance state of the Energy Model above the
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7f37b5caad42f8cc6d0df14bad24fe535ffb14b6..7eee852aa3847ff7e89e2bdd7a098b94d46c7941 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -306,6 +306,14 @@ void arch_update_thermal_pressure(const struct cpumask *cpus,
 {
 }
 #endif

+#ifndef arch_scale_freq_ref
+static __always_inline
+unsigned int arch_scale_freq_ref(int cpu)
+{
+	return 0;
+}
+#endif
+
 static inline int task_node(const struct task_struct *p)
 {
 	return cpu_to_node(task_cpu(p));
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index a49f136014ce6b7a6e43a9d663a93c90fea8ec51..0fd07530d49555dbd11b69557e01bb95901db0f8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -114,6 +114,28 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy)
 	}
 }

+/**
+ * get_capacity_ref_freq - get the reference frequency that has been used to
+ * correlate frequency and compute capacity for a given cpufreq policy. We use
+ * the CPU managing it for the arch_scale_freq_ref() call in the function.
+ * @policy: the cpufreq policy of the CPU in question.
+ *
+ * Return: the reference CPU frequency to compute a capacity.
+ */
+static __always_inline
+unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
+{
+	unsigned int freq = arch_scale_freq_ref(policy->cpu);
+
+	if (freq)
+		return freq;
+
+	if (arch_scale_freq_invariant())
+		return policy->cpuinfo.max_freq;
+
+	return policy->cur;
+}
+
 /**
  * get_next_freq - Compute a new frequency for a given cpufreq policy.
  * @sg_policy: schedutil policy object to compute the new frequency for.
@@ -140,9 +162,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 				  unsigned long util, unsigned long max)
 {
 	struct cpufreq_policy *policy = sg_policy->policy;
-	unsigned int freq = arch_scale_freq_invariant() ?
-				policy->cpuinfo.max_freq : policy->cur;
+	unsigned int freq;

+	freq = get_capacity_ref_freq(policy);
 	util = map_util_perf(util);
 	freq = map_util_freq(util, freq, max);
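Putting the schedutil change in context (a user-space rendering with illustrative values, not part of the patch): with get_capacity_ref_freq() supplying the reference, get_next_freq() reduces to adding the 25% headroom of map_util_perf() and mapping utilization onto the reference frequency with map_util_freq():

#include <stdio.h>

/* As map_util_perf(): add 25% headroom to the raw utilization. */
static unsigned long map_util_perf(unsigned long util)
{
	return util + (util >> 2);
}

/* As map_util_freq(): freq = ref_freq * util / capacity. */
static unsigned long map_util_freq(unsigned long util, unsigned long freq,
				   unsigned long cap)
{
	return freq * util / cap;
}

int main(void)
{
	unsigned long ref_freq = 2800000; /* kHz, from arch_scale_freq_ref() */
	unsigned long max_cap = 1024;	  /* arch_scale_cpu_capacity() */
	unsigned long util = 512;	  /* half-utilized CPU */

	/* get_next_freq(): 512 -> 640 with headroom -> 1750000 kHz. */
	printf("%lu kHz\n",
	       map_util_freq(map_util_perf(util), ref_freq, max_cap));
	return 0;
}

The fallbacks in get_capacity_ref_freq() preserve the previous behaviour for platforms without a reference frequency: cpuinfo.max_freq when frequency invariance is available, and policy->cur otherwise.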