diff --git a/Documentation/virt/kvm/arm/pvsched.rst b/Documentation/virt/kvm/arm/pvsched.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6ba221e25089ed8b1de0098472a9c5cdbae059f7
--- /dev/null
+++ b/Documentation/virt/kvm/arm/pvsched.rst
@@ -0,0 +1,74 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Paravirtualized sched support for arm64
+=======================================
+
+KVM/arm64 provides a set of hypervisor service calls to support paravirtualized
+scheduling.
+
+Some SMCCC compatible hypercalls are defined:
+
+* PV_SCHED_FEATURES: 0xC5000090
+* PV_SCHED_IPA_INIT: 0xC5000091
+* PV_SCHED_IPA_RELEASE: 0xC5000092
+* PV_SCHED_KICK_CPU: 0xC5000093
+
+The existence of the PV_SCHED hypercalls should be probed using the SMCCC 1.1
+ARCH_FEATURES mechanism before calling them.
+
+PV_SCHED_FEATURES
+    ============= ========    ==========
+    Function ID:  (uint32)    0xC5000090
+    PV_call_id:   (uint32)    The function to query for support.
+    Return value: (int64)     NOT_SUPPORTED (-1) or SUCCESS (0) if the relevant
+                              PV-sched feature is supported by the hypervisor.
+    ============= ========    ==========
+
+PV_SCHED_IPA_INIT
+    ============= ========    ==========
+    Function ID:  (uint32)    0xC5000091
+    Return value: (int64)     NOT_SUPPORTED (-1) or SUCCESS (0) if the IPA of
+                              this vCPU's PV data structure is shared to the
+                              hypervisor.
+    ============= ========    ==========
+
+PV_SCHED_IPA_RELEASE
+    ============= ========    ==========
+    Function ID:  (uint32)    0xC5000092
+    Return value: (int64)     NOT_SUPPORTED (-1) or SUCCESS (0) if the IPA of
+                              this vCPU's PV data structure is released.
+    ============= ========    ==========
+
+PV_SCHED_KICK_CPU
+    ============= ========    ==========
+    Function ID:  (uint32)    0xC5000093
+    Return value: (int64)     NOT_SUPPORTED (-1) or SUCCESS (0) if the vCPU is
+                              kicked by the hypervisor.
+    ============= ========    ==========
+
+PV sched state
+--------------
+
+The structure whose IPA is passed via the PV_SCHED_IPA_INIT hypercall is:
+
++-----------+-------------+-------------+-----------------------------------+
+| Field     | Byte Length | Byte Offset | Description                       |
++===========+=============+=============+===================================+
+| preempted |      4      |      0      | Indicates whether the vCPU that   |
+|           |             |             | owns this struct has been         |
+|           |             |             | preempted. Non-zero values mean   |
+|           |             |             | the vCPU has been preempted. Zero |
+|           |             |             | means the vCPU is not preempted.  |
++-----------+-------------+-------------+-----------------------------------+
+
+The preempted field will be updated to 0 by the hypervisor prior to scheduling
+a vCPU. When the vCPU is scheduled out, the preempted field will be updated
+to 1 by the hypervisor.
+
+A vCPU of a paravirtualized guest that is busy-waiting in guest kernel mode
+for an event to occur (e.g. a spinlock to become available) can execute a WFI
+instruction once it has busy-waited for more than a threshold time interval.
+Execution of WFI causes the hypervisor to put the vCPU to sleep until the
+occurrence of an appropriate event. Another vCPU of the same guest can wake
+up the sleeping vCPU by issuing the PV_SCHED_KICK_CPU hypercall, specifying
+the CPU id (reg1) of the vCPU to be woken up.
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 8bcd353e333e6b1c7a420a5fe27cb510d2714202..e7b642bda4d55d8c5fbd187133ab459d89a13eb0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1121,6 +1121,19 @@ config PARAVIRT
 	  under a hypervisor, potentially improving performance significantly
 	  over full virtualization.
+config PARAVIRT_SPINLOCKS + bool "Paravirtualization layer for spinlocks" + depends on PARAVIRT && SMP + help + Paravirtualized spinlocks allow a pvops backend to replace the + spinlock implementation with something virtualization-friendly + (for example, block the virtual CPU rather than spinning). + + It has a minimal impact on native kernels and gives a nice performance + benefit on paravirtualized KVM kernels. + + If you are unsure how to answer this question, answer Y. + config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" select PARAVIRT diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index c9a867ac32d48d7cd04c17e499afd7e97727df68..4bb6c766bb2bdee669101231a36ca2b129a9b998 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -24,6 +24,7 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_USER_NS=y +CONFIG_SCHED_STEAL=y CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_KALLSYMS_ALL=y @@ -64,6 +65,8 @@ CONFIG_ARM64_VA_BITS_48=y CONFIG_SCHED_MC=y CONFIG_NUMA=y CONFIG_SECCOMP=y +CONFIG_PARAVIRT=y +CONFIG_PARAVIRT_TIME_ACCOUNTING=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_XEN=y @@ -117,6 +120,11 @@ CONFIG_CRYPTO_AES_ARM64_BS=m CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y + +# About Paravirt Queued Spinlocks +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y + # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_KSM=y CONFIG_MEMORY_FAILURE=y diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 98a5405c855882376e94baecdeaccf4082c60b13..4f286b8da938cec76ecfe2990bb96b7c03138f5a 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -18,7 +18,6 @@ generic-y += mm-arch-hooks.h generic-y += mmiowb.h generic-y += msi.h generic-y += qrwlock.h -generic-y += qspinlock.h generic-y += serial.h generic-y += set_memory.h generic-y += switch_to.h diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a639c7fee3a8c59f8b1ed7cd3d1254c53a6436b7..6c7a8af7fe27213b94fc4485b8dc14850cc22a5c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -340,6 +340,18 @@ struct kvm_vcpu_arch { /* True when deferrable sysregs are loaded on the physical CPU, * see kvm_vcpu_load_sysregs and kvm_vcpu_put_sysregs. 
*/ bool sysregs_loaded_on_cpu; + + /* Guest PV state */ + struct { + u64 last_steal; + gpa_t base; + } steal; + + /* Guest PV sched state */ + struct { + bool pv_unhalted; + gpa_t base; + } pvsched; }; /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ @@ -482,6 +494,20 @@ void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, int kvm_perf_init(void); int kvm_perf_teardown(void); +long kvm_hypercall_pvsched_features(struct kvm_vcpu *vcpu); +void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted); +long kvm_pvsched_kick_vcpu(struct kvm_vcpu *vcpu); + +static inline void kvm_arm_pvsched_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) +{ + vcpu_arch->pvsched.base = GPA_INVALID; +} + +static inline bool kvm_arm_is_pvsched_enabled(struct kvm_vcpu_arch *vcpu_arch) +{ + return (vcpu_arch->pvsched.base != GPA_INVALID); +} + void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h index 799d9dd6f7ccb252addeb4f4d5fb99b94370f5f3..0e20346bbb75a5b7a9069d70f5114c3b6263157d 100644 --- a/arch/arm64/include/asm/paravirt.h +++ b/arch/arm64/include/asm/paravirt.h @@ -11,8 +11,19 @@ struct pv_time_ops { unsigned long long (*steal_clock)(int cpu); }; +struct pv_sched_ops { + void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); + void (*queued_spin_unlock)(struct qspinlock *lock); + + void (*wait)(u8 *ptr, u8 val); + void (*kick)(int cpu); + + bool (*vcpu_is_preempted)(int cpu); +}; + struct paravirt_patch_template { struct pv_time_ops time; + struct pv_sched_ops sched; }; extern struct paravirt_patch_template pv_ops; @@ -21,6 +32,50 @@ static inline u64 paravirt_steal_clock(int cpu) { return pv_ops.time.steal_clock(cpu); } -#endif + +int __init pv_time_init(void); + +int __init pv_sched_init(void); + +__visible bool __native_vcpu_is_preempted(int cpu); +static inline bool pv_vcpu_is_preempted(int cpu) +{ + return pv_ops.sched.vcpu_is_preempted(cpu); +} + +#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) +void __init pv_qspinlock_init(void); +bool pv_is_native_spin_unlock(void); +static inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + return pv_ops.sched.queued_spin_lock_slowpath(lock, val); +} + +static inline void pv_queued_spin_unlock(struct qspinlock *lock) +{ + return pv_ops.sched.queued_spin_unlock(lock); +} + +static inline void pv_wait(u8 *ptr, u8 val) +{ + return pv_ops.sched.wait(ptr, val); +} + +static inline void pv_kick(int cpu) +{ + return pv_ops.sched.kick(cpu); +} +#else + +#define pv_qspinlock_init() do {} while (0) + +#endif /* SMP && PARAVIRT_SPINLOCKS */ + +#else + +#define pv_time_init() do {} while (0) +#define pv_sched_init() do {} while (0) + +#endif // CONFIG_PARAVIRT #endif diff --git a/arch/arm64/include/asm/pvclock-abi.h b/arch/arm64/include/asm/pvclock-abi.h new file mode 100644 index 0000000000000000000000000000000000000000..c4f1c0a0789cd18b78d415c522f9701c560c7959 --- /dev/null +++ b/arch/arm64/include/asm/pvclock-abi.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2019 Arm Ltd. 
*/ + +#ifndef __ASM_PVCLOCK_ABI_H +#define __ASM_PVCLOCK_ABI_H + +/* The below structure is defined in ARM DEN0057A */ + +struct pvclock_vcpu_stolen_time { + __le32 revision; + __le32 attributes; + __le64 stolen_time; + /* Structure must be 64 byte aligned, pad to that size */ + u8 padding[48]; +} __packed; + +#endif diff --git a/arch/arm64/include/asm/pvsched-abi.h b/arch/arm64/include/asm/pvsched-abi.h new file mode 100644 index 0000000000000000000000000000000000000000..e32ddd48b6142f25b64c4658b870d50b5fb41372 --- /dev/null +++ b/arch/arm64/include/asm/pvsched-abi.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2019 Huawei Technologies Co., Ltd + * Author: Zengruan Ye + */ + +#ifndef __ASM_PVSCHED_ABI_H +#define __ASM_PVSCHED_ABI_H + +struct pvsched_vcpu_state { + __le32 preempted; + /* Structure must be 64 byte aligned, pad to that size */ + u8 padding[60]; +} __packed; + +#endif \ No newline at end of file diff --git a/arch/arm64/include/asm/qspinlock.h b/arch/arm64/include/asm/qspinlock.h new file mode 100644 index 0000000000000000000000000000000000000000..fa842bcc7434c728b6d991df89052d82bb4e1164 --- /dev/null +++ b/arch/arm64/include/asm/qspinlock.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2020 Huawei Technologies Co., Ltd + * Author: Zengruan Ye + */ + +#ifndef _ASM_ARM64_QSPINLOCK_H +#define _ASM_ARM64_QSPINLOCK_H + +#include +#include +#include +#include + +#define _Q_PENDING_LOOPS (1 << 9) + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + * + * A smp_store_release() on the least-significant byte. + */ +static inline void native_queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release(&lock->locked, 0); +} + +static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + pv_queued_spin_lock_slowpath(lock, val); +} + +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + pv_queued_spin_unlock(lock); +} +#endif + +#include + +#endif /* _ASM_ARM64_QSPINLOCK_H */ diff --git a/arch/arm64/include/asm/qspinlock_paravirt.h b/arch/arm64/include/asm/qspinlock_paravirt.h new file mode 100644 index 0000000000000000000000000000000000000000..eba4be28fbb9d4e286269b7f19f97fe9f3f91f17 --- /dev/null +++ b/arch/arm64/include/asm/qspinlock_paravirt.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2019 Huawei Technologies Co., Ltd + * Author: Zengruan Ye + */ + +#ifndef __ASM_QSPINLOCK_PARAVIRT_H +#define __ASM_QSPINLOCK_PARAVIRT_H + +extern void __pv_queued_spin_unlock(struct qspinlock *lock); + +#endif diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index b093b287babf0afbc773914992752d4977d28b61..ab3ab398fb963bbed03a381cca1221aa93dec7ac 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -7,8 +7,34 @@ #include #include +#include + +/* How long a lock should spin before we consider blocking */ +#define SPIN_THRESHOLD (1 << 15) /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() +/* + * Changing this will break osq_lock() thanks to the call inside + * smp_cond_load_relaxed(). 
+ * + * See: + * https://lore.kernel.org/lkml/20200110100612.GC2827@hirez.programming.kicks-ass.net + */ +#define vcpu_is_preempted vcpu_is_preempted +#ifdef CONFIG_PARAVIRT +static inline bool vcpu_is_preempted(int cpu) +{ + return pv_vcpu_is_preempted(cpu); +} + +#else + +static inline bool vcpu_is_preempted(int cpu) +{ + return false; +} +#endif /* CONFIG_PARAVIRT */ + #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 9763b9f576d8ef663665f943ed1b2b6b1f4d2108..4875d2caa0a4dc3ce83be16e74233869fe984370 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -50,7 +50,8 @@ obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o -obj-$(CONFIG_PARAVIRT) += paravirt.o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt-spinlocks.o +obj-$(CONFIG_PARAVIRT_SPINLOCKS) += paravirt.o paravirt-spinlocks.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ diff --git a/arch/arm64/kernel/paravirt-spinlocks.c b/arch/arm64/kernel/paravirt-spinlocks.c new file mode 100644 index 0000000000000000000000000000000000000000..88b9181ac87d9b3258527db5171c7984ca2a26cc --- /dev/null +++ b/arch/arm64/kernel/paravirt-spinlocks.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2019 Huawei Technologies Co., Ltd + * Author: Zengruan Ye + */ + +#include +#include + +__visible bool __native_vcpu_is_preempted(int cpu) +{ + return false; +} + +bool pv_is_native_spin_unlock(void) +{ + return false; +} \ No newline at end of file diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 4cfed91fe256e5c94f6d6386f6673918ee593398..488ea7dd9cb83782726f2ef36eeab57a25a47c63 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -6,13 +6,339 @@ * Author: Stefano Stabellini */ +#define pr_fmt(fmt) "arm-pv: " fmt + +#include +#include #include +#include #include +#include +#include +#include +#include #include + #include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "trace-paravirt.h" struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -struct paravirt_patch_template pv_ops; +struct paravirt_patch_template pv_ops = { +#ifdef CONFIG_PARAVIRT_SPINLOCKS + .sched.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .sched.queued_spin_unlock = native_queued_spin_unlock, +#endif + .sched.vcpu_is_preempted = __native_vcpu_is_preempted, +}; EXPORT_SYMBOL_GPL(pv_ops); + +struct pv_time_stolen_time_region { + struct pvclock_vcpu_stolen_time *kaddr; +}; + +static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region); + +static bool steal_acc = true; +static int __init parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + +/* return stolen time in ns by asking the hypervisor */ +static u64 pv_steal_clock(int cpu) +{ + struct pv_time_stolen_time_region *reg; + + reg = per_cpu_ptr(&stolen_time_region, cpu); + + /* + * paravirt_steal_clock() may be called before the CPU + * online notification callback runs. Until the callback + * has run we just return zero. 
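+	 *
+	 * The hypervisor may update stolen_time at any time, so the field
+	 * is read with READ_ONCE() and converted from little-endian below.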
+ */ + if (!reg->kaddr) + return 0; + + return le64_to_cpu(READ_ONCE(reg->kaddr->stolen_time)); +} + +static int stolen_time_cpu_down_prepare(unsigned int cpu) +{ + struct pv_time_stolen_time_region *reg; + + reg = this_cpu_ptr(&stolen_time_region); + if (!reg->kaddr) + return 0; + + memunmap(reg->kaddr); + memset(reg, 0, sizeof(*reg)); + + return 0; +} + +static int stolen_time_cpu_online(unsigned int cpu) +{ + struct pv_time_stolen_time_region *reg; + struct arm_smccc_res res; + + reg = this_cpu_ptr(&stolen_time_region); + + arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_ST, &res); + + if (res.a0 == SMCCC_RET_NOT_SUPPORTED) + return -EINVAL; + + reg->kaddr = memremap(res.a0, + sizeof(struct pvclock_vcpu_stolen_time), + MEMREMAP_WB); + + if (!reg->kaddr) { + pr_warn("Failed to map stolen time data structure\n"); + return -ENOMEM; + } + + if (le32_to_cpu(reg->kaddr->revision) != 0 || + le32_to_cpu(reg->kaddr->attributes) != 0) { + pr_warn_once("Unexpected revision or attributes in stolen time data\n"); + return -ENXIO; + } + + return 0; +} + +static int __init pv_time_init_stolen_time(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "hypervisor/arm/pvtime:online", + stolen_time_cpu_online, + stolen_time_cpu_down_prepare); + if (ret < 0) + return ret; + return 0; +} + +static bool __init has_pv_steal_clock(void) +{ + struct arm_smccc_res res; + + /* To detect the presence of PV time support we require SMCCC 1.1+ */ + if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE) + return false; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_HV_PV_TIME_FEATURES, &res); + + if (res.a0 != SMCCC_RET_SUCCESS) + return false; + + arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_FEATURES, + ARM_SMCCC_HV_PV_TIME_ST, &res); + + return (res.a0 == SMCCC_RET_SUCCESS); +} + +int __init pv_time_init(void) +{ + int ret; + + if (!has_pv_steal_clock()) { + pr_warn("PV steal time is not available\n"); + return 0; + } + + ret = pv_time_init_stolen_time(); + if (ret) + return ret; + + pv_ops.time.steal_clock = pv_steal_clock; + + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + + pr_info("using stolen time PV\n"); + + return 0; +} + + + +DEFINE_PER_CPU(struct pvsched_vcpu_state, pvsched_vcpu_region) __aligned(64); +EXPORT_PER_CPU_SYMBOL(pvsched_vcpu_region); + +static bool kvm_vcpu_is_preempted(int cpu) +{ + struct pvsched_vcpu_state *reg; + u32 preempted; + + reg = &per_cpu(pvsched_vcpu_region, cpu); + if (!reg) { + pr_warn_once("PV sched enabled but not configured for cpu %d\n", + cpu); + return false; + } + + preempted = le32_to_cpu(READ_ONCE(reg->preempted)); + + return !!preempted; +} + +static int pvsched_vcpu_state_dying_cpu(unsigned int cpu) +{ + struct pvsched_vcpu_state *reg; + struct arm_smccc_res res; + + reg = this_cpu_ptr(&pvsched_vcpu_region); + if (!reg) + return -EFAULT; + + arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_SCHED_IPA_RELEASE, &res); + memset(reg, 0, sizeof(*reg)); + + return 0; +} + +static int init_pvsched_vcpu_state(unsigned int cpu) +{ + struct pvsched_vcpu_state *reg; + struct arm_smccc_res res; + + reg = this_cpu_ptr(&pvsched_vcpu_region); + if (!reg) + return -EFAULT; + + /* Pass the memory address to host via hypercall */ + arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_SCHED_IPA_INIT, + virt_to_phys(reg), &res); + + return 0; +} + +static int kvm_arm_init_pvsched(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ARM_KVM_PVSCHED_STARTING, + "hypervisor/arm/pvsched:starting", + init_pvsched_vcpu_state, + 
pvsched_vcpu_state_dying_cpu); + + if (ret < 0) { + pr_warn("PV sched init failed\n"); + return ret; + } + + return 0; +} + +static bool has_kvm_pvsched(void) +{ + struct arm_smccc_res res; + + /* To detect the presence of PV sched support we require SMCCC 1.1+ */ + if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE) + return false; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_HV_PV_SCHED_FEATURES, &res); + + return (res.a0 == SMCCC_RET_SUCCESS); +} + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +static bool arm_pvspin = false; + +/* Kick a cpu by its cpuid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_SCHED_KICK_CPU, cpu, &res); + + trace_kvm_kick_cpu("kvm kick cpu", smp_processor_id(), cpu); +} + +static void kvm_wait(u8 *ptr, u8 val) +{ + unsigned long flags; + + if (in_nmi()) + return; + + local_irq_save(flags); + + if (READ_ONCE(*ptr) != val) + goto out; + + dsb(sy); + wfi(); + trace_kvm_wait("kvm wait wfi", smp_processor_id()); + +out: + local_irq_restore(flags); +} + +void __init pv_qspinlock_init(void) +{ + /* Don't use the PV qspinlock code if there is only 1 vCPU. */ + if (num_possible_cpus() == 1) + arm_pvspin = false; + + if (!arm_pvspin) { + pr_info("PV qspinlocks disabled\n"); + return; + } + pr_info("PV qspinlocks enabled\n"); + + __pv_init_lock_hash(); + pv_ops.sched.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops.sched.queued_spin_unlock = __pv_queued_spin_unlock; + pv_ops.sched.wait = kvm_wait; + pv_ops.sched.kick = kvm_kick_cpu; +} + +static __init int arm_parse_pvspin(char *arg) +{ + arm_pvspin = true; + return 0; +} +early_param("arm_pvspin", arm_parse_pvspin); +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +int __init pv_sched_init(void) +{ + int ret; + + if (is_hyp_mode_available()) + return 0; + + if (!has_kvm_pvsched()) { + pr_warn("PV sched is not available\n"); + return 0; + } + + ret = kvm_arm_init_pvsched(); + if (ret) + return ret; + + pv_ops.sched.vcpu_is_preempted = kvm_vcpu_is_preempted; + pr_info("using PV sched preempted\n"); + + pv_qspinlock_init(); + + return 0; +} +early_initcall(pv_sched_init); \ No newline at end of file diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 0b2946414dc9ccc6fdb2b62b4a0de58b239c7fa2..1b66379ceef24e409faf78e72e4acb59690528bc 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -65,4 +65,6 @@ void __init time_init(void) /* Calibrate the delay loop directly */ lpj_fine = arch_timer_rate / HZ; + + pv_time_init(); } diff --git a/arch/arm64/kernel/trace-paravirt.h b/arch/arm64/kernel/trace-paravirt.h new file mode 100644 index 0000000000000000000000000000000000000000..2d76272f39ae477bd863f9868adc27f2f0f268cf --- /dev/null +++ b/arch/arm64/kernel/trace-paravirt.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2019 Huawei Technologies Co., Ltd + * Author: Zengruan Ye + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM paravirt + +#if !defined(_TRACE_PARAVIRT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PARAVIRT_H + +#include + +TRACE_EVENT(kvm_kick_cpu, + TP_PROTO(const char *name, int cpu, int target), + TP_ARGS(name, cpu, target), + + TP_STRUCT__entry( + __string(name, name) + __field(int, cpu) + __field(int, target) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->cpu = cpu; + __entry->target = target; + ), + + TP_printk("PV qspinlock: %s, cpu %d kick target cpu %d", + __get_str(name), + __entry->cpu, + 
__entry->target + ) +); + +TRACE_EVENT(kvm_wait, + TP_PROTO(const char *name, int cpu), + TP_ARGS(name, cpu), + + TP_STRUCT__entry( + __string(name, name) + __field(int, cpu) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->cpu = cpu; + ), + + TP_printk("PV qspinlock: %s, cpu %d wait kvm access wfi", + __get_str(name), + __entry->cpu + ) +); + +#endif /* _TRACE_PARAVIRT_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH ../../../arch/arm64/kernel/ +#define TRACE_INCLUDE_FILE trace-paravirt + +#include diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 3ac1a64d2fb9d736e5c2a2bfa778f450a13bee0b..da5f170c916d98466b19674eacf3d78b6a7f6a49 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o +kvm-$(CONFIG_KVM_ARM_HOST) += hypercalls.o pvsched.o kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 1249f68a9418174edfdda3e8c0aca83c34da3733..5f49476ef9c6ca59b6e0fa17d7e8998cff77fc03 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -22,6 +22,8 @@ #include #include +#include + #define CREATE_TRACE_POINTS #include "trace.h" @@ -96,6 +98,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) } else { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; + vcpu->arch.pvsched.pv_unhalted = false; kvm_vcpu_block(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); } diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c new file mode 100644 index 0000000000000000000000000000000000000000..e81cf3186ad7c62d0df303729b549bd09308eaf2 --- /dev/null +++ b/arch/arm64/kvm/hypercalls.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Arm Ltd. 
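+//
+// Top-level handler for guest SMCCC calls: answers SMCCC version and
+// ARCH_FEATURES probes, services the PV sched hypercalls, and falls
+// back to kvm_psci_call() for everything else.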
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+#include <kvm/arm_hypercalls.h>
+#include <kvm/arm_psci.h>
+
+int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
+{
+	u32 func_id = smccc_get_function(vcpu);
+	long val = SMCCC_RET_NOT_SUPPORTED;
+	u32 feature;
+	gpa_t gpa;
+
+	switch (func_id) {
+	case ARM_SMCCC_VERSION_FUNC_ID:
+		val = ARM_SMCCC_VERSION_1_1;
+		break;
+	case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
+		feature = smccc_get_arg1(vcpu);
+		switch (feature) {
+		case ARM_SMCCC_ARCH_WORKAROUND_1:
+			switch (kvm_arm_harden_branch_predictor()) {
+			case KVM_BP_HARDEN_UNKNOWN:
+				break;
+			case KVM_BP_HARDEN_WA_NEEDED:
+				val = SMCCC_RET_SUCCESS;
+				break;
+			case KVM_BP_HARDEN_NOT_REQUIRED:
+				val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
+				break;
+			}
+			break;
+		case ARM_SMCCC_ARCH_WORKAROUND_2:
+			switch (kvm_arm_have_ssbd()) {
+			case KVM_SSBD_FORCE_DISABLE:
+			case KVM_SSBD_UNKNOWN:
+				break;
+			case KVM_SSBD_KERNEL:
+				val = SMCCC_RET_SUCCESS;
+				break;
+			case KVM_SSBD_FORCE_ENABLE:
+			case KVM_SSBD_MITIGATED:
+				val = SMCCC_RET_NOT_REQUIRED;
+				break;
+			}
+			break;
+		case ARM_SMCCC_ARCH_WORKAROUND_3:
+			switch (kvm_arm_get_spectre_bhb_state()) {
+			case SPECTRE_VULNERABLE:
+				break;
+			case SPECTRE_MITIGATED:
+				val = SMCCC_RET_SUCCESS;
+				break;
+			case SPECTRE_UNAFFECTED:
+				val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
+				break;
+			}
+			break;
+		case ARM_SMCCC_HV_PV_SCHED_FEATURES:
+			val = SMCCC_RET_SUCCESS;
+			break;
+		}
+		break;
+	case ARM_SMCCC_HV_PV_SCHED_FEATURES:
+		val = kvm_hypercall_pvsched_features(vcpu);
+		break;
+	case ARM_SMCCC_HV_PV_SCHED_IPA_INIT:
+		gpa = smccc_get_arg1(vcpu);
+		if (gpa != GPA_INVALID) {
+			vcpu->arch.pvsched.base = gpa;
+			val = SMCCC_RET_SUCCESS;
+		}
+		break;
+	case ARM_SMCCC_HV_PV_SCHED_IPA_RELEASE:
+		vcpu->arch.pvsched.base = GPA_INVALID;
+		val = SMCCC_RET_SUCCESS;
+		break;
+	case ARM_SMCCC_HV_PV_SCHED_KICK_CPU:
+		val = kvm_pvsched_kick_vcpu(vcpu);
+		break;
+	default:
+		return kvm_psci_call(vcpu);
+	}
+
+	smccc_set_retval(vcpu, val, 0, 0, 0);
+	return 1;
+}
diff --git a/arch/arm64/kvm/pvsched.c b/arch/arm64/kvm/pvsched.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4e35d095c11f4a2d419e6a888ea73649bbf4d4d
--- /dev/null
+++ b/arch/arm64/kvm/pvsched.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright(c) 2019 Huawei Technologies Co., Ltd
+ * Author: Zengruan Ye
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/pvsched-abi.h>
+
+#include <kvm/arm_hypercalls.h>
+
+void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u64 base = vcpu->arch.pvsched.base;
+	u64 offset = offsetof(struct pvsched_vcpu_state, preempted);
+	int idx;
+
+	if (base == GPA_INVALID)
+		return;
+
+	/*
+	 * This function is called from atomic context, so we need to
+	 * disable page faults.
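+	 * kvm_put_guest() resolves the gfn to a userspace address and
+	 * writes with put_user(), which could otherwise sleep on a fault;
+	 * the srcu read lock taken below protects the memslot lookup.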
+ */ + pagefault_disable(); + + idx = srcu_read_lock(&kvm->srcu); + kvm_put_guest(kvm, base + offset, cpu_to_le32(preempted)); + srcu_read_unlock(&kvm->srcu, idx); + + pagefault_enable(); +} + +long kvm_pvsched_kick_vcpu(struct kvm_vcpu *vcpu) +{ + unsigned int vcpu_idx; + long val = SMCCC_RET_NOT_SUPPORTED; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *target = NULL; + + vcpu_idx = smccc_get_arg1(vcpu); + target = kvm_get_vcpu(kvm, vcpu_idx); + if (!target) + goto out; + + target->arch.pvsched.pv_unhalted = true; + kvm_make_request(KVM_REQ_IRQ_PENDING, target); + kvm_vcpu_kick(target); + if (READ_ONCE(target->ready)) + kvm_vcpu_yield_to(target); + + val = SMCCC_RET_SUCCESS; + +out: + return val; +} + +long kvm_hypercall_pvsched_features(struct kvm_vcpu *vcpu) +{ + u32 feature = smccc_get_arg1(vcpu); + long val = SMCCC_RET_NOT_SUPPORTED; + + switch (feature) { + case ARM_SMCCC_HV_PV_SCHED_FEATURES: + case ARM_SMCCC_HV_PV_SCHED_IPA_INIT: + case ARM_SMCCC_HV_PV_SCHED_IPA_RELEASE: + case ARM_SMCCC_HV_PV_SCHED_KICK_CPU: + val = SMCCC_RET_SUCCESS; + break; + } + + return val; +} + diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index eb797081d159647e06899cb0dbe9c340f6a142f8..47445b9dd8cd35313e40d3d8eb044a634d8f74e3 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -409,7 +409,7 @@ static void __init psci_init_smccc(void) if (feature != PSCI_RET_NOT_SUPPORTED) { u32 ret; ret = invoke_psci_fn(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0); - if (ret == ARM_SMCCC_VERSION_1_1) { + if (ret >= ARM_SMCCC_VERSION_1_1) { psci_ops.smccc_version = SMCCC_VERSION_1_1; ver = ret; } diff --git a/include/kvm/arm_hypercalls.h b/include/kvm/arm_hypercalls.h new file mode 100644 index 0000000000000000000000000000000000000000..37293e317e9ae784540db6e819cc3ffdce5395d6 --- /dev/null +++ b/include/kvm/arm_hypercalls.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2019 Arm Ltd. 
*/ + +#ifndef __KVM_ARM_HYPERCALLS_H +#define __KVM_ARM_HYPERCALLS_H + +#include + +int kvm_hvc_call_handler(struct kvm_vcpu *vcpu); + +static inline u32 smccc_get_function(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 0); +} + +static inline unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 1); +} + +static inline unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 2); +} + +static inline unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 3); +} + +static inline void smccc_set_retval(struct kvm_vcpu *vcpu, + unsigned long a0, + unsigned long a1, + unsigned long a2, + unsigned long a3) +{ + vcpu_set_reg(vcpu, 0, a0); + vcpu_set_reg(vcpu, 1, a1); + vcpu_set_reg(vcpu, 2, a2); + vcpu_set_reg(vcpu, 3, a3); +} + +#endif \ No newline at end of file diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h index 632e78bdef4dfdebc664daf1b2811a8f285ba212..4a45ba7775251670ada061c97693451d4c55be08 100644 --- a/include/kvm/arm_psci.h +++ b/include/kvm/arm_psci.h @@ -39,8 +39,7 @@ static inline int kvm_psci_version(struct kvm_vcpu *vcpu, struct kvm *kvm) return KVM_ARM_PSCI_0_1; } - -int kvm_hvc_call_handler(struct kvm_vcpu *vcpu); +int kvm_psci_call(struct kvm_vcpu *vcpu); struct kvm_one_reg; diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 3e6ef64e74d3ddaa9c6de0515cec8b832401c8e5..7cb2ab726b83147e38fdbbabcab54f92b9a1db95 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -45,6 +45,7 @@ #define ARM_SMCCC_OWNER_SIP 2 #define ARM_SMCCC_OWNER_OEM 3 #define ARM_SMCCC_OWNER_STANDARD 4 +#define ARM_SMCCC_OWNER_STANDARD_HYP 5 #define ARM_SMCCC_OWNER_TRUSTED_APP 48 #define ARM_SMCCC_OWNER_TRUSTED_APP_END 49 #define ARM_SMCCC_OWNER_TRUSTED_OS 50 @@ -383,5 +384,30 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, ARM_SMCCC_OWNER_STANDARD_HYP, \ 0x21) +/* Paravirtualised sched calls */ +#define ARM_SMCCC_HV_PV_SCHED_FEATURES \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x90) + +#define ARM_SMCCC_HV_PV_SCHED_IPA_INIT \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x91) + +#define ARM_SMCCC_HV_PV_SCHED_IPA_RELEASE \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x92) + +#define ARM_SMCCC_HV_PV_SCHED_KICK_CPU \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x93) + #endif /*__ASSEMBLY__*/ #endif /*__LINUX_ARM_SMCCC_H*/ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 15835f37bd5f2c2dbfa791c29a59bb7675ec0584..5e35e3a51b2c93ba575379460f15f3459491da09 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -139,6 +139,7 @@ enum cpuhp_state { /* Must be the last timer callback */ CPUHP_AP_DUMMY_TIMER_STARTING, CPUHP_AP_ARM_XEN_STARTING, + CPUHP_AP_ARM_KVM_PVSCHED_STARTING, CPUHP_AP_ARM_CORESIGHT_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3e80b0bdfdead33e41e1318b0776e6c332ad1bcc..99fa04d5b643c8e38b8ea66017087cebb98e7720 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -769,6 +769,29 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, unsigned long len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t 
gpa, unsigned long len); + +#define __kvm_put_guest(kvm, gfn, offset, v) \ +({ \ + unsigned long __addr = gfn_to_hva(kvm, gfn); \ + typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ + int __ret = -EFAULT; \ + \ + if (!kvm_is_error_hva(__addr)) \ + __ret = put_user(v, __uaddr); \ + if (!__ret) \ + mark_page_dirty(kvm, gfn); \ + __ret; \ +}) + +#define kvm_put_guest(kvm, gpa, v) \ +({ \ + gpa_t __gpa = gpa; \ + struct kvm *__kvm = kvm; \ + \ + __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \ + offset_in_page(__gpa), v); \ +}) + int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 2382cb58969d70106e32187f058ed005654e34fb..68e84cf42a3f94f2bcb6bf1b35d494f34ddeedad 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -35,6 +35,8 @@ typedef unsigned long gva_t; typedef u64 gpa_t; typedef u64 gfn_t; +#define GPA_INVALID (~(gpa_t)0) + typedef unsigned long hva_t; typedef u64 hpa_t; typedef u64 hfn_t; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a29eacf1f6e71dd8a6470eecc2ba804145d7cf40..3e987f1cd91ec2b43b501ffd432fb03c936036e3 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -89,6 +89,9 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; +#ifdef CONFIG_SCHED_STEAL + struct sparsemask *cfs_overload_cpus; +#endif #ifndef __GENKSYMS__ int nr_idle_scan; #endif diff --git a/init/Kconfig b/init/Kconfig index 2542f4aeba581dd68b50b158846fe11e5d2b50cf..650293f0c71fbc561fb3651bf38f730e7f438229 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1251,6 +1251,22 @@ config SECURITY_MONITOR help Allow user to add security monitor +config SCHED_STEAL + bool "Steal tasks to improve CPU utilization" + depends on SMP + default n + help + When a CPU has no more CFS tasks to run, and idle_balance() fails + to find a task, then attempt to steal a task from an overloaded + CPU in the same LLC. Maintain and use a bitmap of overloaded CPUs + to efficiently identify candidates. To minimize search time, steal + the first migratable task that is found when the bitmap is traversed. + For fairness, search for migratable tasks on an overloaded CPU in + order of next to run. + + If unsure, say N here. 
+ + config CHECKPOINT_RESTORE bool "Checkpoint/restore support" select PROC_CHILDREN diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dc5f92ac8111a3349ec69a5b176adef5a7577790..ce4a8f547f6c18a0932ab89b422129b7ed623304 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3326,17 +3326,48 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, DEFINE_STATIC_KEY_FALSE(sched_schedstats); static bool __initdata __sched_schedstats = false; +#ifdef CONFIG_SCHED_STEAL +unsigned long schedstat_skid; + +static void compute_skid(void) +{ + int i, n = 0; + s64 t; + int skid = 0; + + for (i = 0; i < 100; i++) { + t = local_clock(); + t = local_clock() - t; + if (t > 0 && t < 1000) { /* only use sane samples */ + skid += (int) t; + n++; + } + } + + if (n > 0) + schedstat_skid = skid / n; + else + schedstat_skid = 0; + pr_info("schedstat_skid = %lu\n", schedstat_skid); +} +#else +static inline void compute_skid(void) {} +#endif + static void set_schedstats(bool enabled) { - if (enabled) + if (enabled) { + compute_skid(); static_branch_enable(&sched_schedstats); - else + } else { static_branch_disable(&sched_schedstats); + } } void force_schedstat_enabled(void) { if (!schedstat_enabled()) { + compute_skid(); pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); static_branch_enable(&sched_schedstats); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f8abbf625b11b7387ebba91c11efde22d8038ffb..a3e8821a5386de15e17c479dc0816e5f740eac36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,9 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#ifdef CONFIG_SCHED_STEAL +#include "sparsemask.h" +#endif #include "fair.h" #include @@ -3794,6 +3797,69 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } +static inline void rq_idle_stamp_update(struct rq *rq) +{ + rq->idle_bt_stamp = rq_clock(rq); +} + +static inline void rq_idle_stamp_clear(struct rq *rq) +{ + rq->idle_bt_stamp = 0; +} + +#ifdef CONFIG_SCHED_STEAL + +static inline bool steal_enabled(void) +{ +#ifdef CONFIG_NUMA + bool allow = static_branch_likely(&sched_steal_allow); +#else + bool allow = true; +#endif + return sched_feat(STEAL) && allow; +} + +static void overload_clear(struct rq *rq) +{ + struct sparsemask *overload_cpus; + unsigned long time; + + if (!steal_enabled()) + return; + + time = schedstat_start_time(); + rcu_read_lock(); + overload_cpus = rcu_dereference(rq->cfs_overload_cpus); + if (overload_cpus) + sparsemask_clear_elem(overload_cpus, rq->cpu); + rcu_read_unlock(); + schedstat_end_time(rq, time); +} + +static void overload_set(struct rq *rq) +{ + struct sparsemask *overload_cpus; + unsigned long time; + + if (!steal_enabled()) + return; + + time = schedstat_start_time(); + rcu_read_lock(); + overload_cpus = rcu_dereference(rq->cfs_overload_cpus); + if (overload_cpus) + sparsemask_set_elem(overload_cpus, rq->cpu); + rcu_read_unlock(); + schedstat_end_time(rq, time); +} + +static int try_steal(struct rq *this_rq, struct rq_flags *rf); +#else +static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; } +static inline void overload_clear(struct rq *rq) {} +static inline void overload_set(struct rq *rq) {} +#endif + #else /* CONFIG_SMP */ #define UPDATE_TG 0x0 @@ -3817,6 +3883,15 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) return 0; } +static inline int newidle_balance(struct rq 
*rq, struct rq_flags *rf) +{ + return 0; +} + +static inline void rq_idle_stamp_update(struct rq *rq) {} +static inline void rq_idle_stamp_clear(struct rq *rq) {} + +static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; } static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} @@ -3929,6 +4004,20 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = max_vruntime(se->vruntime, vruntime); } +#ifdef CONFIG_SCHED_STEAL +#define SET_STAT(STAT) \ + do { \ + if (schedstat_enabled()) { \ + struct rq *rq = this_rq(); \ + \ + if (rq) \ + __schedstat_inc(rq->STAT); \ + } \ + } while (0) +#else +#define SET_STAT(STAT) +#endif + static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline void check_schedstat_required(void) @@ -6753,12 +6842,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { + unsigned long time; struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + time = schedstat_start_time(); + /* * required for stable ->cpus_allowed */ @@ -6810,6 +6902,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f current->recent_used_cpu = cpu; } rcu_read_unlock(); + schedstat_end_time(cpu_rq(cpu), time); return new_cpu; } @@ -7219,6 +7312,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf struct sched_entity *se; struct task_struct *p; int new_tasks; + unsigned long time; again: if (!sched_fair_runnable(rq)) @@ -7342,12 +7436,26 @@ done: __maybe_unused; if (!rf) return NULL; + time = schedstat_start_time(); + + /* + * We must set idle_stamp _before_ calling try_steal() or + * idle_balance(), such that we measure the duration as idle time. + */ + rq_idle_stamp_update(rq); + new_tasks = newidle_balance(rq, rf); + if (new_tasks == 0) + new_tasks = try_steal(rq, rf); + schedstat_end_time(rq, time); + + if (new_tasks) + rq_idle_stamp_clear(rq); /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is - * possible for any higher priority task to appear. In that case we - * must re-start the pick_next_entity() loop. + * Because try_steal() and idle_balance() release (and re-acquire) + * rq->lock, it is possible for any higher priority task to appear. + * In that case we must re-start the pick_next_entity() loop. */ if (new_tasks < 0) return RETRY_TASK; @@ -7779,15 +7887,45 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; } +#ifdef CONFIG_SCHED_STEAL +/* + * Return true if task @p can migrate from @rq to @dst_rq in the same LLC. + * No need to test for co-locality, and no need to test task_hot(), as sharing + * LLC provides cache warmth at that level. 
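+ * Only the cheap feasibility checks remain: group throttling, cpu
+ * affinity, and whether the task is currently running.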
+ */ +static bool +can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) +{ + int dst_cpu = dst_rq->cpu; + + lockdep_assert_held(&rq->__lock); + + if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu)) + return false; + + if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) { + schedstat_inc(p->se.statistics.nr_failed_migrations_affine); + return false; + } + + if (task_running(rq, p)) { + schedstat_inc(p->se.statistics.nr_failed_migrations_running); + return false; + } + + return true; +} +#endif + /* * detach_task() -- detach the task for the migration specified in env */ -static void detach_task(struct task_struct *p, struct lb_env *env) +static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) { - lockdep_assert_rq_held(env->src_rq); + lockdep_assert_held(&src_rq->__lock); - deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); - set_task_cpu(p, env->dst_cpu); + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, dst_cpu); } /* @@ -7807,7 +7945,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) if (!can_migrate_task(p, env)) continue; - detach_task(p, env); + detach_task(p, env->src_rq, env->dst_cpu); /* * Right now, this is only the second place where @@ -7882,7 +8020,7 @@ static int detach_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - detach_task(p, env); + detach_task(p, env->src_rq, env->dst_cpu); list_add(&p->se.group_node, &env->tasks); detached++; @@ -10490,6 +10628,157 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } +#ifdef CONFIG_SCHED_STEAL +/* + * Search the runnable tasks in @cfs_rq in order of next to run, and find + * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry. + * On success, dequeue the task from @cfs_rq and return it, else return NULL. + */ +static struct task_struct * +detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq) +{ + int dst_cpu = dst_rq->cpu; + struct task_struct *p; + struct rq *rq = rq_of(cfs_rq); + + lockdep_assert_held(&rq_of(cfs_rq)->__lock); + + list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) { + if (can_migrate_task_llc(p, rq, dst_rq)) { + detach_task(p, rq, dst_cpu); + return p; + } + } + return NULL; +} + +/* + * Attempt to migrate a CFS task from @src_cpu to @dst_rq. @locked indicates + * whether @dst_rq is already locked on entry. This function may lock or + * unlock @dst_rq, and updates @locked to indicate the locked state on return. + * The locking protocol is based on idle_balance(). + * Returns 1 on success and 0 on failure. 
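+ *
+ * As in idle_balance(), @dst_rq is unlocked while the source rq is
+ * searched and locked, and is relocked only when a task was detached
+ * and must be attached here.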
+ */ +static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + int src_cpu) +{ + struct task_struct *p; + struct rq_flags rf; + int stolen = 0; + int dst_cpu = dst_rq->cpu; + struct rq *src_rq = cpu_rq(src_cpu); + + if (dst_cpu == src_cpu || src_rq->cfs.h_nr_running < 2) + return 0; + + if (*locked) { + rq_unpin_lock(dst_rq, dst_rf); + raw_spin_unlock(&dst_rq->__lock); + *locked = false; + } + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.h_nr_running < 2 || !cpu_active(src_cpu)) + p = NULL; + else + p = detach_next_task(&src_rq->cfs, dst_rq); + + rq_unlock(src_rq, &rf); + + if (p) { + raw_spin_lock(&dst_rq->__lock); + rq_repin_lock(dst_rq, dst_rf); + *locked = true; + update_rq_clock(dst_rq); + attach_task(dst_rq, p); + stolen = 1; + schedstat_inc(dst_rq->steal); + } + local_irq_restore(rf.flags); + + return stolen; +} + +/* + * Conservative upper bound on the max cost of a steal, in nsecs (the typical + * cost is 1-2 microsec). Do not steal if average idle time is less. + */ +#define SCHED_STEAL_COST 10000 + +/* + * Try to steal a runnable CFS task from a CPU in the same LLC as @dst_rq, + * and migrate it to @dst_rq. rq_lock is held on entry and return, but + * may be dropped in between. Return 1 on success, 0 on failure, and -1 + * if a task in a different scheduling class has become runnable on @dst_rq. + */ +static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) +{ + int src_cpu; + int dst_cpu = dst_rq->cpu; + bool locked = true; + int stolen = 0; + bool any_overload = false; + struct sparsemask *overload_cpus; + + if (!steal_enabled()) + return 0; + + if (!cpu_active(dst_cpu)) + return 0; + + if (dst_rq->avg_idle_bt < SCHED_STEAL_COST) + return 0; + + /* Get bitmap of overloaded CPUs in the same LLC as @dst_rq */ + + rcu_read_lock(); + overload_cpus = rcu_dereference(dst_rq->cfs_overload_cpus); + if (!overload_cpus) { + rcu_read_unlock(); + return 0; + } + +#ifdef CONFIG_SCHED_SMT + /* + * First try overloaded CPUs on the same core to preserve cache warmth. + */ + if (static_branch_likely(&sched_smt_present)) { + for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) { + if (sparsemask_test_elem(overload_cpus, src_cpu) && + steal_from(dst_rq, dst_rf, &locked, src_cpu)) { + stolen = 1; + goto out; + } + } + } +#endif /* CONFIG_SCHED_SMT */ + + /* Accept any suitable task in the LLC */ + + sparsemask_for_each(overload_cpus, dst_cpu, src_cpu) { + if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) { + stolen = 1; + goto out; + } + any_overload = true; + } + +out: + rcu_read_unlock(); + if (!locked) { + raw_spin_lock(&dst_rq->__lock); + rq_repin_lock(dst_rq, dst_rf); + } + stolen |= (dst_rq->cfs.h_nr_running > 0); + if (dst_rq->nr_running != dst_rq->cfs.h_nr_running) + stolen = -1; + if (!stolen && any_overload) + schedstat_inc(dst_rq->steal_fail); + return stolen; +} +#endif + static void rq_online_fair(struct rq *rq) { update_sysctl(); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 59e83f5a5719d5449360e86b0b863dcf6a716210..7c8b111258d909dd3b36d3ed32fb08577ce5d47e 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -58,6 +58,14 @@ SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_PROP, true) SCHED_FEAT(SIS_UTIL, false) +#ifdef CONFIG_SCHED_STEAL +/* + * Steal a CFS task from another CPU when going idle. + * Improves CPU utilization. + */ +SCHED_FEAT(STEAL, false) +#endif + /* * Issue a WARN when we do multiple update_rq_clock() calls * in a single rq->lock section. 
Default disabled because the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eb6a4ffb8103df49ff1e1913229d2f7bc2d45462..e57f1a7d5c61a0f5dd906322f3c389ee2deead32 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -85,6 +85,9 @@ struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_STEAL +struct sparsemask; +#endif /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 @@ -1152,6 +1155,9 @@ struct rq { #endif struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_STEAL + struct sparsemask *cfs_overload_cpus; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -1278,6 +1284,17 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; + +#ifdef CONFIG_SCHED_STEAL + /* Idle search stats */ + unsigned int found_idle_cpu_capacity; + unsigned int found_idle_cpu; + unsigned int found_idle_cpu_easy; + unsigned int nofound_idle_cpu; + unsigned long find_time; + unsigned int steal; + unsigned int steal_fail; +#endif /* CONFIG_SCHED_STEAL */ #endif #ifdef CONFIG_SMP @@ -1828,6 +1845,10 @@ static inline void set_sched_cluster(void) { } #endif #ifdef CONFIG_NUMA +#ifdef CONFIG_SCHED_STEAL +extern struct static_key_true sched_steal_allow; +#endif + enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, diff --git a/kernel/sched/sparsemask.h b/kernel/sched/sparsemask.h new file mode 100644 index 0000000000000000000000000000000000000000..11948620a1a2b2f428fb1e08775a860a1d7aa230 --- /dev/null +++ b/kernel/sched/sparsemask.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sparsemask.h - sparse bitmap operations + * + * Copyright (c) 2018 Oracle Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __LINUX_SPARSEMASK_H +#define __LINUX_SPARSEMASK_H + +#include +#include +#include + +/* + * A sparsemask is a sparse bitmap. It reduces cache contention vs the usual + * bitmap when many threads concurrently set, clear, and visit elements. For + * each cacheline chunk of the mask, only the first K bits of the first word are + * used, and the remaining bits are ignored, where K is a creation time + * parameter. Thus a sparsemask that can represent a set of N elements is + * approximately (N/K * CACHELINE) bytes in size. + * + * Clients pass and receive element numbers in the public API, and the + * implementation translates them to bit numbers to perform the bitmap + * operations. 
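+ *
+ * A minimal usage sketch, using the helpers defined below; visit()
+ * stands in for the caller's work, and density 3 stores 2^3 = 8
+ * elements per 64-byte chunk:
+ *
+ *	struct sparsemask *m;
+ *
+ *	m = sparsemask_alloc_node(nr_cpu_ids, 3, GFP_KERNEL | __GFP_ZERO, node);
+ *	sparsemask_set_elem(m, cpu);
+ *	sparsemask_for_each(m, this_cpu, cpu)
+ *		visit(cpu);
+ *	sparsemask_clear_elem(m, cpu);
+ *	sparsemask_free(m);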
+ */ + +struct sparsemask_chunk { + unsigned long word; /* the significant bits */ +} ____cacheline_aligned_in_smp; + +struct sparsemask { + short nelems; /* current number of elements */ + short density; /* store 2^density elements per chunk */ + struct sparsemask_chunk chunks[0]; /* embedded array of chunks */ +}; + +#define _SMASK_INDEX(density, elem) ((elem) >> (density)) +#define _SMASK_BIT(density, elem) ((elem) & ((1U << (density)) - 1U)) +#define SMASK_INDEX(mask, elem) _SMASK_INDEX((mask)->density, elem) +#define SMASK_BIT(mask, elem) _SMASK_BIT((mask)->density, elem) +#define SMASK_WORD(mask, elem) \ + (&(mask)->chunks[SMASK_INDEX((mask), (elem))].word) + +/* + * sparsemask_next() - Return the next one bit in a bitmap, starting at a + * specified position and wrapping from the last bit to the first, up to but + * not including a specified origin. This is a helper, so do not call it + * directly. + * + * @mask: Bitmap to search. + * @origin: Origin. + * @prev: Previous bit. Start search after this bit number. + * If -1, start search at @origin. + * + * Return: the bit number, else mask->nelems if no bits are set in the range. + */ +static inline int +sparsemask_next(const struct sparsemask *mask, int origin, int prev) +{ + int density = mask->density; + int bits_per_word = 1U << density; + const struct sparsemask_chunk *chunk; + int nelems = mask->nelems; + int next, bit, nbits; + unsigned long word; + + /* Calculate number of bits to be searched. */ + if (prev == -1) { + nbits = nelems; + next = origin; + } else if (prev < origin) { + nbits = origin - prev; + next = prev + 1; + } else { + nbits = nelems - prev + origin - 1; + next = prev + 1; + } + + if (unlikely(next >= nelems)) + return nelems; + + /* + * Fetch and adjust first word. Clear word bits below @next, and round + * @next down to @bits_per_word boundary because later ffs will add + * those bits back. + */ + chunk = &mask->chunks[_SMASK_INDEX(density, next)]; + bit = _SMASK_BIT(density, next); + word = chunk->word & (~0UL << bit); + next -= bit; + nbits += bit; + + while (!word) { + next += bits_per_word; + nbits -= bits_per_word; + if (nbits <= 0) + return nelems; + + if (next >= nelems) { + chunk = mask->chunks; + nbits -= (next - nelems); + next = 0; + } else { + chunk++; + } + word = chunk->word; + } + + next += __ffs(word); + if (next >= origin && prev != -1) + return nelems; + return next; +} + +/****************** The public API ********************/ + +/* + * Max value for the density parameter, limited by 64 bits in the chunk word. + */ +#define SMASK_DENSITY_MAX 6 + +/* + * Return bytes to allocate for a sparsemask, for custom allocators. + */ +static inline size_t sparsemask_size(int nelems, int density) +{ + int index = _SMASK_INDEX(density, nelems) + 1; + + return offsetof(struct sparsemask, chunks[index]); +} + +/* + * Initialize an allocated sparsemask, for custom allocators. + */ +static inline void +sparsemask_init(struct sparsemask *mask, int nelems, int density) +{ + WARN_ON(density < 0 || density > SMASK_DENSITY_MAX || nelems < 0); + mask->nelems = nelems; + mask->density = density; +} + +/* + * sparsemask_alloc_node() - Allocate, initialize, and return a sparsemask. + * + * @nelems - maximum number of elements. + * @density - store 2^density elements per cacheline chunk. + * values from 0 to SMASK_DENSITY_MAX inclusive. 
+ * @flags - kmalloc allocation flags + * @node - numa node + */ +static inline struct sparsemask * +sparsemask_alloc_node(int nelems, int density, gfp_t flags, int node) +{ + int nbytes = sparsemask_size(nelems, density); + struct sparsemask *mask = kmalloc_node(nbytes, flags, node); + + if (mask) + sparsemask_init(mask, nelems, density); + return mask; +} + +static inline void sparsemask_free(struct sparsemask *mask) +{ + kfree(mask); +} + +static inline void sparsemask_set_elem(struct sparsemask *dst, int elem) +{ + set_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem)); +} + +static inline void sparsemask_clear_elem(struct sparsemask *dst, int elem) +{ + clear_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem)); +} + +static inline int sparsemask_test_elem(const struct sparsemask *mask, int elem) +{ + return test_bit(SMASK_BIT(mask, elem), SMASK_WORD(mask, elem)); +} + +/* + * sparsemask_for_each() - iterate over each set bit in a bitmap, starting at a + * specified position, and wrapping from the last bit to the first. + * + * @mask: Bitmap to iterate over. + * @origin: Bit number at which to start searching. + * @elem: Iterator. Can be signed or unsigned integer. + * + * The implementation does not assume any bit in @mask is set, including + * @origin. After the loop, @elem = @mask->nelems. + */ +#define sparsemask_for_each(mask, origin, elem) \ + for ((elem) = -1; \ + (elem) = sparsemask_next((mask), (origin), (elem)), \ + (elem) < (mask)->nelems;) + +#endif /* __LINUX_SPARSEMASK_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed27c70c5ae73b07651d8c2e0e2eab..6df040e406ceb02ba3c4d647f7fe0cde78e0795b 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -10,7 +10,12 @@ * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ + +#ifdef CONFIG_SCHED_STEAL +#define SCHEDSTAT_VERSION 16 +#else #define SCHEDSTAT_VERSION 15 +#endif static int show_schedstat(struct seq_file *seq, void *v) { @@ -37,6 +42,17 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); +#ifdef CONFIG_SCHED_STEAL + seq_printf(seq, " %u %u %u %u %lu %u %u", + rq->found_idle_cpu_easy, + rq->found_idle_cpu_capacity, + rq->found_idle_cpu, + rq->nofound_idle_cpu, + rq->find_time, + rq->steal, + rq->steal_fail); +#endif /* CONFIG_SCHED_STEAL */ + seq_printf(seq, "\n"); #ifdef CONFIG_SMP diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 398035545a6a8c4263d2025f4431cfc322bff4c5..6eee86a21bbcceab4dd601fb8edba8c5d1184b8d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -49,6 +49,25 @@ static inline void update_schedstat_avg(u64 *avg, u64 sample) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) #define schedstat_update_avg(var, val) do { update_schedstat_avg(var, val); } while (0) +#ifdef CONFIG_SCHED_STEAL +#define schedstat_start_time() schedstat_val_or_zero(local_clock()) +#define __schedstat_end_time(stat, time) \ + do { \ + unsigned long endtime; \ + \ + if (schedstat_enabled() && (time)) { \ + endtime = local_clock() - (time) - schedstat_skid; \ + schedstat_add((stat), endtime); \ + } \ + } while (0) +#define schedstat_end_time(rq, time) \ + __schedstat_end_time(((rq)->find_time), time) +extern unsigned long schedstat_skid; +#else /* !CONFIG_SCHED_STEAL */ +# define schedstat_start_time() 0 +# define schedstat_end_time(rq, t) do { } while (0) +#endif /* CONFIG_SCHED_STEAL */ + #else /* !CONFIG_SCHEDSTATS: */ static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } @@ -63,6 +82,8 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val(var) 0 # define schedstat_val_or_zero(var) 0 # define schedstat_update_avg(var, val) do { } while (0) +# define schedstat_start_time() 0 +# define schedstat_end_time(rq, t) do { } while (0) #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_PSI diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 0281da0b3e96351a2f8a7e56b55ad656cbe0199e..e9392b963b30cdfad74e33f5037d1b0ed6fbbea1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,6 +3,9 @@ * Scheduler topology setup/handling methods */ #include "sched.h" +#ifdef CONFIG_SCHED_STEAL +#include "sparsemask.h" +#endif DEFINE_MUTEX(sched_domains_mutex); @@ -10,6 +13,18 @@ DEFINE_MUTEX(sched_domains_mutex); static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; +struct s_data; +#ifdef CONFIG_SCHED_STEAL +static int sd_llc_alloc(struct sched_domain *sd); +static void sd_llc_free(struct sched_domain *sd); +static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d); +static void sd_llc_free_all(const struct cpumask *cpu_map); +#else +static inline void sd_llc_free(struct sched_domain *sd) {} +static inline int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d) { return 0; } +static inline void sd_llc_free_all(const struct cpumask *cpu_map) {} +#endif + #ifdef CONFIG_SCHED_DEBUG static int __init sched_debug_setup(char *str) @@ -617,6 +632,10 @@ DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { +#ifdef CONFIG_SCHED_STEAL + struct rq *rq = cpu_rq(cpu); + struct sparsemask *cfs_overload_cpus = NULL; +#endif struct sched_domain_shared *sds = NULL; struct sched_domain *sd; int id = cpu; @@ -627,8 +646,14 @@ static void update_top_cache_domain(int cpu) id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); sds = sd->shared; +#ifdef CONFIG_SCHED_STEAL + cfs_overload_cpus = sds->cfs_overload_cpus; +#endif } +#ifdef CONFIG_SCHED_STEAL + rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus); +#endif rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; @@ -1652,6 +1677,33 @@ static void init_numa_topology_type(void) } } +#ifdef CONFIG_SCHED_STEAL +DEFINE_STATIC_KEY_TRUE(sched_steal_allow); +static int sched_steal_node_limit; +#define SCHED_STEAL_NODE_LIMIT_DEFAULT 2 + +static int __init steal_node_limit_setup(char *buf) +{ + get_option(&buf, &sched_steal_node_limit); + return 0; +} + 
+early_param("sched_steal_node_limit", steal_node_limit_setup);
+
+static void check_node_limit(void)
+{
+	int n = num_possible_nodes();
+
+	if (sched_steal_node_limit == 0)
+		sched_steal_node_limit = SCHED_STEAL_NODE_LIMIT_DEFAULT;
+	if (n > sched_steal_node_limit) {
+		static_branch_disable(&sched_steal_allow);
+		pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d\n", n);
+	}
+}
+#else
+static inline void check_node_limit(void) { }
+#endif /* CONFIG_SCHED_STEAL */
 
 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
 
@@ -1949,6 +2001,80 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+#ifdef CONFIG_SCHED_STEAL
+static int sd_llc_alloc(struct sched_domain *sd)
+{
+	struct sched_domain_shared *sds = sd->shared;
+	struct cpumask *span = sched_domain_span(sd);
+	int nid = cpu_to_node(cpumask_first(span));
+	gfp_t flags = __GFP_ZERO | GFP_KERNEL;
+	struct sparsemask *mask;
+
+	/*
+	 * Allocate the bitmap if not already allocated.  This is called for
+	 * every CPU in the LLC but only allocates once per sd_llc_shared.
+	 */
+	if (!sds->cfs_overload_cpus) {
+		mask = sparsemask_alloc_node(nr_cpu_ids, 3, flags, nid);
+		if (!mask)
+			return 1;
+		sds->cfs_overload_cpus = mask;
+	}
+
+	return 0;
+}
+
+static void sd_llc_free(struct sched_domain *sd)
+{
+	struct sched_domain_shared *sds = sd->shared;
+
+	if (!sds)
+		return;
+
+	sparsemask_free(sds->cfs_overload_cpus);
+	sds->cfs_overload_cpus = NULL;
+}
+
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
+{
+	struct sched_domain *sd, *hsd;
+	int i;
+
+	for_each_cpu(i, cpu_map) {
+		/* Find highest domain that shares resources */
+		hsd = NULL;
+		for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) {
+			if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
+				break;
+			hsd = sd;
+		}
+		if (hsd && sd_llc_alloc(hsd))
+			return 1;
+	}
+
+	return 0;
+}
+
+static void sd_llc_free_all(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	struct sched_domain *sd;
+	struct sd_data *sdd;
+	int j;
+
+	for_each_sd_topology(tl) {
+		sdd = &tl->data;
+		if (!sdd)
+			continue;
+		for_each_cpu(j, cpu_map) {
+			sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd)
+				sd_llc_free(sd);
+		}
+	}
+}
+#endif
+
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int dflags, int cpu)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index f7150fbeeb55e32e3cddbec102518d47089f195a..73f8a39ec1b894bc68f84ae5a24ea0194b004fc4 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -278,6 +278,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto free_vcpu;
 
+	kvm_arm_pvsched_vcpu_init(&vcpu->arch);
+
 	err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
 	if (err)
 		goto vcpu_uninit;
@@ -305,6 +307,9 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+	if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
+		kvm_update_pvsched_preempted(vcpu, 0);
+
 	kvm_pmu_vcpu_destroy(vcpu);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -425,6 +430,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
 	vcpu->cpu = -1;
 
+	if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
+		kvm_update_pvsched_preempted(vcpu, 1);
+
 	kvm_arm_set_running_vcpu(NULL);
 }
 
@@ -475,7 +483,9 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
 	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
-	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
+	bool pv_unhalted = v->arch.pvsched.pv_unhalted;
+
+	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v) || pv_unhalted)
 		&& !v->arch.power_off && !v->arch.pause);
 }
 
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index 2f5dc7fb437bd70f8726e44b759a31a3a62dc501..9f42f7584b7271f4a0a4f3c5e8f89b3bfcb00bb5 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -15,6 +15,7 @@
 #include <asm/kvm_host.h>
 
 #include <kvm/arm_psci.h>
+#include <kvm/arm_hypercalls.h>
 
 /*
  * This is an implementation of the Power State Coordination Interface
@@ -23,38 +24,6 @@
 
 #define AFFINITY_MASK(level)	~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
 
-static u32 smccc_get_function(struct kvm_vcpu *vcpu)
-{
-	return vcpu_get_reg(vcpu, 0);
-}
-
-static unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu)
-{
-	return vcpu_get_reg(vcpu, 1);
-}
-
-static unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu)
-{
-	return vcpu_get_reg(vcpu, 2);
-}
-
-static unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu)
-{
-	return vcpu_get_reg(vcpu, 3);
-}
-
-static void smccc_set_retval(struct kvm_vcpu *vcpu,
-			     unsigned long a0,
-			     unsigned long a1,
-			     unsigned long a2,
-			     unsigned long a3)
-{
-	vcpu_set_reg(vcpu, 0, a0);
-	vcpu_set_reg(vcpu, 1, a1);
-	vcpu_set_reg(vcpu, 2, a2);
-	vcpu_set_reg(vcpu, 3, a3);
-}
-
 static unsigned long psci_affinity_mask(unsigned long affinity_level)
 {
 	if (affinity_level <= 3)
@@ -373,7 +342,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
  * Errors:
  * -EINVAL: Unrecognized PSCI function
  */
-static int kvm_psci_call(struct kvm_vcpu *vcpu)
+int kvm_psci_call(struct kvm_vcpu *vcpu)
 {
 	switch (kvm_psci_version(vcpu, vcpu->kvm)) {
 	case KVM_ARM_PSCI_1_0:
@@ -387,67 +356,6 @@ static int kvm_psci_call(struct kvm_vcpu *vcpu)
 	};
 }
 
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
-{
-	u32 func_id = smccc_get_function(vcpu);
-	u32 val = SMCCC_RET_NOT_SUPPORTED;
-	u32 feature;
-
-	switch (func_id) {
-	case ARM_SMCCC_VERSION_FUNC_ID:
-		val = ARM_SMCCC_VERSION_1_1;
-		break;
-	case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
-		feature = smccc_get_arg1(vcpu);
-		switch(feature) {
-		case ARM_SMCCC_ARCH_WORKAROUND_1:
-			switch (kvm_arm_harden_branch_predictor()) {
-			case KVM_BP_HARDEN_UNKNOWN:
-				break;
-			case KVM_BP_HARDEN_WA_NEEDED:
-				val = SMCCC_RET_SUCCESS;
-				break;
-			case KVM_BP_HARDEN_NOT_REQUIRED:
-				val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
-				break;
-			}
-			break;
-		case ARM_SMCCC_ARCH_WORKAROUND_2:
-			switch (kvm_arm_have_ssbd()) {
-			case KVM_SSBD_FORCE_DISABLE:
-			case KVM_SSBD_UNKNOWN:
-				break;
-			case KVM_SSBD_KERNEL:
-				val = SMCCC_RET_SUCCESS;
-				break;
-			case KVM_SSBD_FORCE_ENABLE:
-			case KVM_SSBD_MITIGATED:
-				val = SMCCC_RET_NOT_REQUIRED;
-				break;
-			}
-			break;
-		case ARM_SMCCC_ARCH_WORKAROUND_3:
-			switch (kvm_arm_get_spectre_bhb_state()) {
-			case SPECTRE_VULNERABLE:
-				break;
-			case SPECTRE_MITIGATED:
-				val = SMCCC_RET_SUCCESS;
-				break;
-			case SPECTRE_UNAFFECTED:
-				val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
-				break;
-			}
-			break;
-		}
-		break;
-	default:
-		return kvm_psci_call(vcpu);
-	}
-
-	smccc_set_retval(vcpu, val, 0, 0, 0);
-	return 1;
-}
-
 int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
 {
 	return 4;		/* PSCI version and three workaround registers */
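
For reference, a minimal usage sketch of the sparsemask API added above (illustrative only, not part of the patch; the function name example_overload_tracking() is hypothetical, and nr_cpu_ids is assumed to be the element count as in sd_llc_alloc()):

	static int example_overload_tracking(void)
	{
		struct sparsemask *mask;
		int cpu;

		/*
		 * Density 3 stores 2^3 = 8 elements per 64-byte chunk, so
		 * writers for nearby CPUs rarely share a cacheline.
		 */
		mask = sparsemask_alloc_node(nr_cpu_ids, 3,
					     GFP_KERNEL | __GFP_ZERO,
					     NUMA_NO_NODE);
		if (!mask)
			return -ENOMEM;

		sparsemask_set_elem(mask, 5);
		sparsemask_set_elem(mask, 9);

		/* Starting at origin 7 and wrapping: visits 9, then 5. */
		sparsemask_for_each(mask, 7, cpu)
			pr_info("overloaded: cpu %d\n", cpu);

		sparsemask_clear_elem(mask, 9);
		sparsemask_free(mask);
		return 0;
	}

The cacheline-aligned chunks are what distinguish this from a plain bitmap: with a small density, set_bit()/clear_bit() traffic from different CPUs lands on different cachelines.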
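The rq->cfs_overload_cpus pointer published by update_top_cache_domain() is meant to be consumed under RCU, matching the rcu_assign_pointer() in the topology.c hunk. A hedged sketch of the writer side as the later CFS steal changes would use it (overload_set() is a hypothetical name, not code from this series):

	static void overload_set(struct rq *rq)
	{
		struct sparsemask *overload_cpus;

		rcu_read_lock();
		overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
		if (overload_cpus)
			sparsemask_set_elem(overload_cpus, rq->cpu);
		rcu_read_unlock();
	}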
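Similarly, the schedstat_start_time()/schedstat_end_time() helpers added to kernel/sched/stats.h bracket a code region and accumulate its skid-corrected duration into rq->find_time, which show_schedstat() reports under version 16. A sketch of the intended call pattern (the surrounding idle-search code is assumed, not shown):

	unsigned long time;

	time = schedstat_start_time();
	/* ... scan for an idle CPU or a task to steal ... */
	schedstat_end_time(this_rq(), time);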