diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 7481b12939e62a7cb3bff111a0a4b35c519ec010..3f9be66edece967c66e0ca25da824174d078030e 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1228,12 +1228,15 @@ CONFIG_ETMEM_SCAN=m CONFIG_ETMEM_SWAP=m CONFIG_ETMEM=y # CONFIG_BPF_READAHEAD is not set +CONFIG_MEM_SAMPLING=y +CONFIG_NUMABALANCING_MEM_SAMPLING=y # # Data Access Monitoring # CONFIG_DAMON=y CONFIG_DAMON_VADDR=y +CONFIG_DAMON_MEM_SAMPLING=y CONFIG_DAMON_PADDR=y CONFIG_DAMON_SYSFS=y # CONFIG_DAMON_DBGFS is not set @@ -6969,6 +6972,7 @@ CONFIG_CPU_INSPECTOR_ATF=m CONFIG_ROH=m CONFIG_ROH_HNS=m +CONFIG_ARM_SPE_MEM_SAMPLING=y # end of Device Drivers # diff --git a/drivers/Kconfig b/drivers/Kconfig index 3be1197d872c1d64796f3d454e768b9f0e01f899..2b65435015d7fc2b2299ff10b14707fd839164a3 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -251,4 +251,6 @@ source "drivers/roh/Kconfig" source "drivers/coda/Kconfig" +source "drivers/arm/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 3955e605df14ffb72cce5203c1d1b60df19d75ee..79d803250002b9459f25e3474a768fd5da2c94c9 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -205,3 +205,5 @@ obj-$(CONFIG_S390) += s390/ obj-$(CONFIG_ROH) += roh/ obj-$(CONFIG_HISI_VIRTCCA_CODA) += coda/ + +obj-$(CONFIG_ARM_SPE_MEM_SAMPLING) += arm/mm_monitor/ diff --git a/drivers/arm/Kconfig b/drivers/arm/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..d3291f0d5d57793689b233eff4afb3aa3ae48046 --- /dev/null +++ b/drivers/arm/Kconfig @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +source "drivers/arm/mm_monitor/Kconfig" diff --git a/drivers/arm/mm_monitor/Kconfig b/drivers/arm/mm_monitor/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..417b403ecffcf1400794cfd134096a4ea21cba15 --- /dev/null +++ b/drivers/arm/mm_monitor/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# arm spe dirver +# +config ARM_SPE_MEM_SAMPLING + bool "In-kernel SPE for driver for page access profiling" + depends on ARM_SPE_PMU + default n + help + Enable support for the ARMv8.2 Statistical Profiling Extension (SPE), + which provides periodic sampling of memory accesses and operations + in the CPU pipeline. This extension allows the driver to monitor + memory access patterns, which can help with performance tuning, + debugging, and analyzing memory-related bottlenecks. + + This feature is only available on ARM64 architecture and will fall + back to the native software sampling mechanism if the ARM SPE PMU + (Performance Monitoring Unit) is in use. When enabled, this + configuration will activate the in-kernel driver to collect profiling + data on page-level memory accesses. 
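To make the in-kernel interface described in the help text above concrete, here is a minimal, illustrative sketch of a kernel-side consumer driving the sampling path this series exports (mm_spe_start(), mm_spe_stop(), mm_spe_decoding(), mm_spe_getbuf_addr(), mm_spe_getnum_record(), with the record layout from <linux/mem_sampling.h>). The consumer_drain_cpu() wrapper and its pr_debug() reporting are hypothetical; in the series itself the equivalent sequence is normally driven by the mem_sampling layer from the SPE buffer-management interrupt.

/*
 * Illustrative consumer of the mm_spe API added by this series; only the
 * mm_spe_*() calls and the mem_sampling_record fields come from the patch,
 * the wrapper itself is a sketch.
 */
#include <linux/kernel.h>
#include <linux/mem_sampling.h>
/* struct mm_spe_buf is defined in drivers/arm/mm_monitor/mm_spe.h */
#include "mm_spe.h"

static void consumer_drain_cpu(void)
{
	struct mm_spe_buf *buf;
	struct mem_sampling_record *rec;
	int i, nr;

	if (!mm_spe_enabled())
		return;

	/* Program PMSCR/PMSIRR/PMBLIMITR and start filling this CPU's buffer. */
	if (mm_spe_start())
		return;

	/* ... sampling window; the driver normally stops from its IRQ path ... */

	mm_spe_stop();		/* drain and disable the profiling buffer */
	mm_spe_decoding();	/* parse raw SPE packets into records     */

	buf = mm_spe_getbuf_addr();
	nr = mm_spe_getnum_record();
	for (i = 0; i < nr; i++) {
		rec = (struct mem_sampling_record *)buf->record_base + i;
		pr_debug("va=0x%llx pa=0x%llx lat=%u\n",
			 rec->virt_addr, rec->phys_addr, rec->latency);
	}
}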
diff --git a/drivers/arm/mm_monitor/Makefile b/drivers/arm/mm_monitor/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9b0b1f18a529b7e6ed7fab13944847316befb807 --- /dev/null +++ b/drivers/arm/mm_monitor/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_ARM_SPE_MEM_SAMPLING) += mm_spe.o spe-decoder/arm-spe-decoder.o spe-decoder/arm-spe-pkt-decoder.o diff --git a/drivers/arm/mm_monitor/mm_spe.c b/drivers/arm/mm_monitor/mm_spe.c new file mode 100644 index 0000000000000000000000000000000000000000..0eaa7e7397e1d392287801e1704d5ce998438e51 --- /dev/null +++ b/drivers/arm/mm_monitor/mm_spe.c @@ -0,0 +1,537 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm_spe.c: Arm Statistical Profiling Extensions support + * Copyright (c) 2019-2020, Arm Ltd. + * Copyright (c) 2024-2025, Huawei Technologies Ltd. + */ + +#define PMUNAME "mm_spe" +#define DRVNAME PMUNAME "_driver" +#define pr_fmt(fmt) DRVNAME ": " fmt + +#include +#include +#include +#include + +#include "spe-decoder/arm-spe-decoder.h" +#include "spe-decoder/arm-spe-pkt-decoder.h" +#include "mm_spe.h" +static bool spe_boost_enable; + +static struct mm_spe *spe; + +#define SPE_INIT_FAIL 0 +#define SPE_INIT_READY 1 +#define SPE_INIT_SUCC 2 +static int spe_probe_status = SPE_INIT_FAIL; + +#define SPE_PMU_FEAT_FILT_EVT (1UL << 0) +#define SPE_PMU_FEAT_FILT_TYP (1UL << 1) +#define SPE_PMU_FEAT_FILT_LAT (1UL << 2) +#define SPE_PMU_FEAT_ARCH_INST (1UL << 3) +#define SPE_PMU_FEAT_LDS (1UL << 4) +#define SPE_PMU_FEAT_ERND (1UL << 5) +#define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) +#define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) + +DEFINE_PER_CPU(struct mm_spe_buf, per_cpu_spe_buf); + +int mm_spe_percpu_buffer_alloc(int cpu) +{ + struct mm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu); + void *alloc_base; + + if (spe_buf->base && spe_buf->record_base) + return 0; + + /* alloc spe raw data buffer */ + alloc_base = kzalloc_node(SPE_BUFFER_MAX_SIZE, GFP_KERNEL, cpu_to_node(cpu)); + if (unlikely(!alloc_base)) { + pr_err("alloc spe raw data buffer failed.\n"); + return -ENOMEM; + } + + spe_buf->base = alloc_base; + + spe_buf->size = SPE_BUFFER_SIZE; + spe_buf->cur = alloc_base + SPE_BUFFER_MAX_SIZE - SPE_BUFFER_SIZE; + spe_buf->period = SPE_SAMPLE_PERIOD; + + /* alloc record buffer */ + spe_buf->record_size = SPE_RECORD_ENTRY_SIZE * SPE_RECORD_BUFFER_MAX_RECORDS; + spe_buf->record_base = kzalloc_node(spe_buf->record_size, GFP_KERNEL, cpu_to_node(cpu)); + if (unlikely(!spe_buf->record_base)) { + kfree(alloc_base); + pr_err("alloc spe record buffer failed.\n"); + return -ENOMEM; + } + return 0; +} +EXPORT_SYMBOL_GPL(mm_spe_percpu_buffer_alloc); + +int mm_spe_buffer_alloc(void) +{ + int cpu, ret = 0; + cpumask_t *mask = &spe->supported_cpus; + + for_each_possible_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + ret = mm_spe_percpu_buffer_alloc(cpu); + if (ret) + return ret; + } + return ret; +} +EXPORT_SYMBOL_GPL(mm_spe_buffer_alloc); + +void mm_spe_percpu_buffer_free(int cpu) +{ + struct mm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu); + + if (!spe_buf->base) + return; + + kfree(spe_buf->base); + spe_buf->cur = NULL; + spe_buf->base = NULL; + spe_buf->size = 0; + + kfree(spe_buf->record_base); + spe_buf->record_base = NULL; + spe_buf->record_size = 0; +} +EXPORT_SYMBOL_GPL(mm_spe_percpu_buffer_free); + +void mm_spe_buffer_free(void) +{ + cpumask_t *mask = &spe->supported_cpus; + int cpu; + + for_each_possible_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + 
mm_spe_percpu_buffer_free(cpu); + } + spe_probe_status -= 1; + set_mem_sampling_state(false); +} +EXPORT_SYMBOL_GPL(mm_spe_buffer_free); + +static void mm_spe_buffer_init(void) +{ + u64 base, limit; + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + if (!spe_buf || !spe_buf->cur || !spe_buf->size) { + /* + * We still need to clear the limit pointer, since the + * profiler might only be disabled by virtue of a fault. + */ + limit = 0; + goto out_write_limit; + } + + base = (u64)spe_buf->cur; + limit = ((u64)spe_buf->cur + spe_buf->size) | PMBLIMITR_EL1_E; + write_sysreg_s(base, SYS_PMBPTR_EL1); + +out_write_limit: + write_sysreg_s(limit, SYS_PMBLIMITR_EL1); +} + +void mm_spe_add_probe_status(void) +{ + spe_probe_status += 1; +} +EXPORT_SYMBOL_GPL(mm_spe_add_probe_status); + +static void mm_spe_disable_and_drain_local(void) +{ + /* Disable profiling at EL0 and EL1 */ + write_sysreg_s(0, SYS_PMSCR_EL1); + isb(); + + /* Drain any buffered data */ + psb_csync(); + dsb(nsh); + + /* Disable the profiling buffer */ + write_sysreg_s(0, SYS_PMBLIMITR_EL1); + isb(); + + /* Disable boost_spe profiling */ + if (spe->support_boost_spe && spe_boost_enable) { + write_sysreg_s(0, SYS_OMHTPG_EL1); + isb(); + } +} + +static u64 mm_spe_to_pmsfcr(void) +{ + u64 reg = 0; + + if (spe->load_filter) + reg |= PMSFCR_EL1_LD; + + if (spe->store_filter) + reg |= PMSFCR_EL1_ST; + + if (spe->branch_filter) + reg |= PMSFCR_EL1_B; + + if (reg) + reg |= PMSFCR_EL1_FT; + + if (spe->event_filter) + reg |= PMSFCR_EL1_FE; + + if (spe->inv_event_filter) + reg |= PMSFCR_EL1_FnE; + + if (spe->min_latency) + reg |= PMSFCR_EL1_FL; + + return reg; +} + +static u64 arm_spe_to_htpg(void) +{ + u64 reg = 0; + struct boost_spe_contol *boost_spe = &spe->boost_spe; + + if (boost_spe->rmt_acc_en) + reg |= SYS_OMHTPG_EL1_RMEN; + + if (boost_spe->boost_spe_en_cfg < 0x4) + reg |= boost_spe->boost_spe_en_cfg; + + if (boost_spe->record_sel) + reg |= SYS_OMHTPG_EL1_REC_SEL; + + if (boost_spe->pop_uop_sel) + reg |= SYS_OMHTPG_EL1_POP_UOP_SEL; + + if (boost_spe->sft_cfg < 0x4) + reg |= boost_spe->sft_cfg << SYS_OMHTPG_EL1_SFT_CFG_SHIFT; + + if (boost_spe->boost_spe_pa_flt_en || boost_spe->rmt_acc_pa_flt_en) { + reg |= 1UL << SYS_OMHTPG_EL1_PAEN_SHIFT; + reg |= 1UL << SYS_OMHTPG_EL1_RMPAFLEN_SHIFT; + + if (boost_spe->pa_flt_pt < 0x8000000 && boost_spe->pa_flt_mask < 0x8000000) { + reg |= boost_spe->pa_flt_pt << SYS_OMHTPG_EL1_PAFL_SHIFT; + reg |= (u64)boost_spe->pa_flt_mask << SYS_OMHTPG_EL1_PAFLMK_SHIFT; + } + } + + return reg; +} + +static u64 mm_spe_to_pmsevfr(void) +{ + return spe->event_filter; +} + +static u64 mm_spe_to_pmsnevfr(void) +{ + return spe->inv_event_filter; +} + +static u64 mm_spe_to_pmslatfr(void) +{ + return spe->min_latency; +} + +static void mm_spe_sanitise_period(struct mm_spe_buf *spe_buf) +{ + u64 period = spe_buf->period; + u64 max_period = PMSIRR_EL1_INTERVAL_MASK; + + if (period < spe->min_period) + period = spe->min_period; + else if (period > max_period) + period = max_period; + else + period &= max_period; + + spe_buf->period = period; +} + +static u64 mm_spe_to_pmsirr(void) +{ + u64 reg = 0; + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + mm_spe_sanitise_period(spe_buf); + + if (spe->jitter) + reg |= 0x1; + + reg |= spe_buf->period << 8; + + return reg; +} + +static u64 mm_spe_to_pmscr(void) +{ + u64 reg = 0; + + if (spe->ts_enable) + reg |= PMSCR_EL1_TS; + + if (spe->pa_enable) + reg |= PMSCR_EL1_PA; + + if (spe->pct_enable < 0x4) + reg |= spe->pct_enable << 6; + + if (spe->exclude_user) + reg
|= PMSCR_EL1_E0SPE; + + if (spe->exclude_kernel) + reg |= PMSCR_EL1_E1SPE; + + if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) + reg |= PMSCR_EL1_CX; + + return reg; +} + +int mm_spe_start(void) +{ + u64 reg; + int cpu = smp_processor_id(); + + if (!cpumask_test_cpu(cpu, &spe->supported_cpus)) + return -ENOENT; + + mm_spe_buffer_init(); + + reg = mm_spe_to_pmsfcr(); + write_sysreg_s(reg, SYS_PMSFCR_EL1); + + reg = mm_spe_to_pmsevfr(); + write_sysreg_s(reg, SYS_PMSEVFR_EL1); + + if (spe->features & SPE_PMU_FEAT_INV_FILT_EVT) { + reg = mm_spe_to_pmsnevfr(); + write_sysreg_s(reg, SYS_PMSNEVFR_EL1); + } + + reg = mm_spe_to_pmslatfr(); + + write_sysreg_s(reg, SYS_PMSLATFR_EL1); + + reg = mm_spe_to_pmsirr(); + write_sysreg_s(reg, SYS_PMSIRR_EL1); + isb(); + + reg = mm_spe_to_pmscr(); + isb(); + write_sysreg_s(reg, SYS_PMSCR_EL1); + + if (spe->support_boost_spe) { + reg = arm_spe_to_htpg(); + isb(); + write_sysreg_s(reg, SYS_OMHTPG_EL1); + } + + return 0; +} + +void mm_spe_continue(void) +{ + int reg; + + mm_spe_buffer_init(); + + reg = mm_spe_to_pmscr(); + + isb(); + write_sysreg_s(reg, SYS_PMSCR_EL1); +} + +void mm_spe_stop(void) +{ + mm_spe_disable_and_drain_local(); +} + +void mm_spe_decoding(void) +{ + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + spe_buf->nr_records = 0; + arm_spe_decode_buf(spe_buf->cur, spe_buf->size); +} + +struct mm_spe_buf *mm_spe_getbuf_addr(void) +{ + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + return spe_buf; +} + +int mm_spe_getnum_record(void) +{ + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + return spe_buf->nr_records; +} + +struct mm_spe *mm_spe_get_desc(void) +{ + return spe; +} +EXPORT_SYMBOL_GPL(mm_spe_get_desc); + +int mm_spe_enabled(void) +{ + return spe_probe_status == SPE_INIT_SUCC; +} + +static const struct of_device_id mm_spe_sample_para_init_tb[] = { + { .compatible = "arm,statistical-profiling-extension-v1", + .data = (void *)1 }, + { /* Sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, mm_spe_sample_para_init_tb); + +static const struct platform_device_id mm_spe_match[] = { + { ARMV8_SPE_MEM_SAMPLING_PDEV_NAME, 0 }, + {} +}; +MODULE_DEVICE_TABLE(platform, mm_spe_match); + +static void arm_spe_boost_spe_para_init(void) +{ + struct boost_spe_contol *boost_spe = &spe->boost_spe; + + boost_spe->record_sel = 1; + boost_spe->pop_uop_sel = 0; + boost_spe->rmt_acc_pa_flt_en = 0; + boost_spe->rmt_acc_en = 1; + boost_spe->boost_spe_pa_flt_en = 0; + boost_spe->pa_flt_pt = 0; + boost_spe->pa_flt_mask = 0; + boost_spe->sft_cfg = 0; + boost_spe->boost_spe_en_cfg = 0x3; +} + +static void mm_spe_sample_para_init(void) +{ + u64 implementor = read_cpuid_implementor(); + u64 part_num = read_cpuid_part_number(); + + /* Is support boost_spe sampling? 
*/ + if (implementor == ARM_CPU_IMP_HISI && part_num == 0xd06) + spe->support_boost_spe = true; + + spe->sample_period = SPE_SAMPLE_PERIOD; + spe->jitter = 1; + spe->load_filter = 1; + spe->store_filter = 1; + spe->branch_filter = 0; + spe->inv_event_filter = 0; + spe->event_filter = 0x2; + + spe->ts_enable = 0; + spe->pa_enable = 1; + spe->pct_enable = 0; + + spe->exclude_user = 1; + spe->exclude_kernel = 0; + + spe->min_latency = 120; + + if (spe->support_boost_spe) + arm_spe_boost_spe_para_init(); +} + +void mm_spe_record_enqueue(struct arm_spe_record *record) +{ + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + struct mem_sampling_record *record_tail; + + if (spe_buf->nr_records >= SPE_RECORD_BUFFER_MAX_RECORDS) { + pr_err("nr_records exceeded!\n"); + return; + } + + if (record->boost_spe_idx) + trace_spe_boost_spe_record((struct mem_sampling_record *)record); + trace_mm_spe_record((struct mem_sampling_record *)record); + record_tail = spe_buf->record_base + + spe_buf->nr_records * SPE_RECORD_ENTRY_SIZE; + *record_tail = *(struct mem_sampling_record *)record; + spe_buf->nr_records++; +} + +static int mm_spe_device_probe(struct platform_device *pdev) +{ + struct device *dev; + + if (!pdev) { + pr_err("pdev is NULL!\n"); + return -ENODEV; + } + + dev = &pdev->dev; + + /* + * If kernelspace is unmapped when running at EL0, then the SPE + * buffer will fault and prematurely terminate the AUX session. + */ + if (arm64_kernel_unmapped_at_el0()) { + dev_warn_once(dev, "buffer inaccessible. Try passing \"kpti=off\" on the kernel command line\n"); + return -EPERM; + } + + spe = devm_kzalloc(dev, sizeof(*spe), GFP_KERNEL); + if (!spe) + return -ENOMEM; + + spe->pdev = pdev; + platform_set_drvdata(pdev, spe); + + mm_spe_sample_para_init(); + + mm_spe_add_probe_status(); + return 0; +} + +static struct platform_driver mm_spe_driver = { + .id_table = mm_spe_match, + .driver = { + .name = DRVNAME, + .of_match_table = of_match_ptr(mm_spe_sample_para_init_tb), + .suppress_bind_attrs = true, + }, + .probe = mm_spe_device_probe, +}; + +static __init int enable_spe_boost(char *str) +{ + spe_boost_enable = true; + return 0; +} +early_param("enable_spe_boost", enable_spe_boost); + +static int __init mm_spe_init(void) +{ + return platform_driver_register(&mm_spe_driver); +} + +static void __exit arm_spe_exit(void) +{ + platform_driver_unregister(&mm_spe_driver); +} + +subsys_initcall(mm_spe_init); diff --git a/drivers/arm/mm_monitor/mm_spe.h b/drivers/arm/mm_monitor/mm_spe.h new file mode 100644 index 0000000000000000000000000000000000000000..5ffc11cb951a1a784291bf1f03be494edbd8f4fa --- /dev/null +++ b/drivers/arm/mm_monitor/mm_spe.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SPE_H +#define __SPE_H + +#define SPE_BUFFER_MAX_SIZE (PAGE_SIZE) +#define SPE_BUFFER_SIZE (PAGE_SIZE / 16) + +#define SPE_SAMPLE_PERIOD 1024 + +#define SPE_RECORD_BUFFER_MAX_RECORDS (100) +#define SPE_RECORD_ENTRY_SIZE sizeof(struct mem_sampling_record) +#define ARMV8_SPE_MEM_SAMPLING_PDEV_NAME "arm,mm_spe,spe-v1" + +/* boost_spe sampling controls */ +#define SYS_OMHTPG_EL1 sys_reg(3, 0, 15, 8, 2) +#define SYS_OMHTPG_EL1_RMCF_SHIFT 0 +#define SYS_OMHTPG_EL1_RMCF_MASK 0x3UL +#define SYS_OMHTPG_EL1_RMEN GENMASK(2, 2) +#define SYS_OMHTPG_EL1_RMEN_SHIFT 2 +#define SYS_OMHTPG_EL1_PAFL GENMASK(3, 3) +#define SYS_OMHTPG_EL1_PAFL_SHIFT 3 +#define SYS_OMHTPG_EL1_PAFL_MASK 0x7FFFFFFUL +#define SYS_OMHTPG_EL1_PAFLMK_SHIFT 30 +#define
SYS_OMHTPG_EL1_PAFLMK_MASK 0x7FFFFFFUL +#define SYS_OMHTPG_EL1_PAEN_SHIFT 57 + +#define SYS_OMHTPG_EL1_RMPAFLEN_SHIFT 58 +#define SYS_OMHTPG_EL1_POP_UOP_SEL GENMASK(59, 59) +#define SYS_OMHTPG_EL1_SFT_CFG_SHIFT 60 +#define SYS_OMHTPG_EL1_SFT_CFG_MASK 0x3UL +#define SYS_OMHTPG_EL1_REC_SEL GENMASK(62, 62) + +struct boost_spe_contol { + u32 boost_spe_en_cfg; + u32 pa_flt_pt; + u32 pa_flt_mask; + u64 sft_cfg; + bool boost_spe_pa_flt_en; + bool rmt_acc_en; + bool rmt_acc_pa_flt_en; + bool pop_uop_sel; + bool record_sel; +}; + +struct mm_spe { + struct pmu pmu; + struct platform_device *pdev; + cpumask_t supported_cpus; + struct hlist_node hotplug_node; + struct boost_spe_contol boost_spe; + int irq; /* PPI */ + u16 pmsver; + u16 min_period; + u16 counter_sz; + u64 features; + u16 max_record_sz; + u16 align; + u64 sample_period; + local64_t period_left; + bool jitter; + bool load_filter; + bool store_filter; + bool branch_filter; + u64 inv_event_filter; + u16 min_latency; + u64 event_filter; + bool ts_enable; + bool pa_enable; + u8 pct_enable; + bool exclude_user; + bool exclude_kernel; + bool support_boost_spe; +}; + +struct mm_spe_buf { + void *cur; /* for spe raw data buffer */ + int size; + int period; + void *base; + + void *record_base; /* for spe record buffer */ + int record_size; + int nr_records; +}; + +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING +void mm_spe_add_probe_status(void); +int mm_spe_percpu_buffer_alloc(int cpu); +int mm_spe_buffer_alloc(void); +void mm_spe_percpu_buffer_free(int cpu); +void mm_spe_buffer_free(void); +struct mm_spe *mm_spe_get_desc(void); +#else +static inline void mm_spe_add_probe_status(void) { } +static inline int mm_spe_percpu_buffer_alloc(int cpu) { return 0; } +static inline int mm_spe_buffer_alloc(void) { return 0; } +static inline void mm_spe_percpu_buffer_free(int cpu) { } +static inline void mm_spe_buffer_free(void) { } +static inline struct mm_spe *mm_spe_get_desc(void) { return NULL; } +#endif +#endif /* __SPE_H */ diff --git a/drivers/arm/mm_monitor/spe-decoder/Makefile b/drivers/arm/mm_monitor/spe-decoder/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4fdae5d381867542ad12a7a7d34aabfdd141e40b --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y := arm-spe-decoder.o arm-spe-pkt-decoder.o diff --git a/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.c b/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.c new file mode 100644 index 0000000000000000000000000000000000000000..1394d377c061946de96f469415677c0a32af88b7 --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arm_spe_decoder.c: ARM SPE support + * Copyright (c) 2017-2018, Arm Ltd. + * Copyright (c) 2024-2025, Huawei Technologies Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "arm-spe-decoder.h" + +static u64 arm_spe_calc_ip(int index, u64 payload) +{ + u64 ns, el, val; + u32 seen_idx; + + /* Instruction virtual address or Branch target address */ + if (index == SPE_ADDR_PKT_HDR_INDEX_INS || + index == SPE_ADDR_PKT_HDR_INDEX_BRANCH) { + ns = SPE_ADDR_PKT_GET_NS(payload); + el = SPE_ADDR_PKT_GET_EL(payload); + + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + + /* Fill highest byte for EL1 or EL2 (VHE) mode */ + if (ns && (el == SPE_ADDR_PKT_EL1 || el == SPE_ADDR_PKT_EL2)) + payload |= 0xffULL << SPE_ADDR_PKT_ADDR_BYTE7_SHIFT; + + /* Data access virtual address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT) { + + /* Clean tags */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + + /* + * Armv8 ARM (ARM DDI 0487F.c), chapter "D10.2.1 Address packet" + * defines the data virtual address payload format, the top byte + * (bits [63:56]) is assigned as top-byte tag; so we only can + * retrieve address value from bits [55:0]. + * + * According to Documentation/arm64/memory.rst, if detects the + * specific pattern in bits [55:52] of payload which falls in + * the kernel space, should fixup the top byte and this allows + * perf tool to parse DSO symbol for data address correctly. + * + * For this reason, if detects the bits [55:52] is 0xf, will + * fill 0xff into the top byte. + */ + val = SPE_ADDR_PKT_ADDR_GET_BYTE_6(payload); + if ((val & 0xf0ULL) == 0xf0ULL) + payload |= 0xffULL << SPE_ADDR_PKT_ADDR_BYTE7_SHIFT; + + /* Data access physical address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS) { + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + /* Boost_spe hot data access physical address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_BOOST_SPE_DATA_PHYS) { + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_BOOST_SPE(payload); + /* Remote Data access physical address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_REMOTE_DATA_PHYS) { + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + } else { + seen_idx = 0; + if (!(seen_idx & BIT(index))) { + seen_idx |= BIT(index); + pr_warn("ignoring unsupported address packet index: 0x%x\n", index); + } + } + + return payload; +} + +void arm_spe_decoder_free(struct arm_spe_decoder *decoder) +{ + kfree(decoder); +} + +static int arm_spe_get_next_packet(struct arm_spe_decoder *decoder) +{ + int ret; + + do { + if (!decoder->len) + return 0; + + ret = arm_spe_get_packet(decoder->buf, decoder->len, + &decoder->packet); + if (ret <= 0) { + /* Move forward for 1 byte */ + decoder->buf += 1; + decoder->len -= 1; + return -EBADMSG; + } + + decoder->buf += ret; + decoder->len -= ret; + } while (decoder->packet.type == ARM_SPE_PAD); + return 1; +} + +static int arm_spe_read_record(struct arm_spe_decoder *decoder) +{ + int err; + int idx; + u64 payload, ip; + + memset(&decoder->record, 0x0, sizeof(decoder->record)); + decoder->record.context_id = (u64)-1; + while (1) { + err = arm_spe_get_next_packet(decoder); + if (err <= 0) + return err; + + idx = decoder->packet.index; + payload = decoder->packet.payload; + + switch (decoder->packet.type) { + case ARM_SPE_TIMESTAMP: + decoder->record.timestamp = payload; + return 1; + case ARM_SPE_END: + return 1; + case ARM_SPE_ADDRESS: + ip = arm_spe_calc_ip(idx, payload); + if (idx == SPE_ADDR_PKT_HDR_INDEX_INS) + decoder->record.from_ip = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_BRANCH) + 
decoder->record.to_ip = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT) + decoder->record.virt_addr = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS) + decoder->record.phys_addr = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_BOOST_SPE_DATA_PHYS) + decoder->record.boost_spe_addr[decoder->record.boost_spe_idx++] + = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_REMOTE_DATA_PHYS) + decoder->record.remote_addr = ip; + break; + case ARM_SPE_COUNTER: + if (idx == SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT) + decoder->record.latency = payload; + break; + case ARM_SPE_CONTEXT: + decoder->record.context_id = payload; + break; + case ARM_SPE_OP_TYPE: + if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) { + if (payload & 0x1) + decoder->record.op = ARM_SPE_ST; + else + decoder->record.op = ARM_SPE_LD; + } + break; + case ARM_SPE_EVENTS: + if (payload & BIT(EV_L1D_REFILL)) + decoder->record.type |= ARM_SPE_L1D_MISS; + + if (payload & BIT(EV_L1D_ACCESS)) + decoder->record.type |= ARM_SPE_L1D_ACCESS; + + if (payload & BIT(EV_TLB_WALK)) + decoder->record.type |= ARM_SPE_TLB_MISS; + + if (payload & BIT(EV_TLB_ACCESS)) + decoder->record.type |= ARM_SPE_TLB_ACCESS; + + if (payload & BIT(EV_LLC_MISS)) + decoder->record.type |= ARM_SPE_LLC_MISS; + + if (payload & BIT(EV_LLC_ACCESS)) + decoder->record.type |= ARM_SPE_LLC_ACCESS; + + if (payload & BIT(EV_REMOTE_ACCESS)) + decoder->record.type |= ARM_SPE_REMOTE_ACCESS; + + if (payload & BIT(EV_MISPRED)) + decoder->record.type |= ARM_SPE_BRANCH_MISS; + + break; + case ARM_SPE_DATA_SOURCE: + decoder->record.source = payload; + break; + case ARM_SPE_BAD: + break; + case ARM_SPE_PAD: + break; + default: + pr_err("Get packet error!\n"); + return -1; + } + } + return 0; +} + +static bool arm_spe_decode(struct arm_spe_decoder *decoder) +{ + if (decoder->len) { + if (arm_spe_read_record(decoder) == 1) + return true; + } + return false; +} + +void arm_spe_decode_buf(const unsigned char *buf, size_t len) +{ + struct arm_spe_decoder decoder; + + decoder.buf = buf; + decoder.len = len; + + while (arm_spe_decode(&decoder)) + mm_spe_record_enqueue(&(decoder.record)); + +} +EXPORT_SYMBOL(arm_spe_decode_buf); diff --git a/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.h b/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..3ccc32de8afc4c2b6f4fde294e828ca1c44a0b40 --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * arm_spe_decoder.h: Arm Statistical Profiling Extensions support + * Copyright (c) 2019-2020, Arm Ltd. 
+ */ + +#ifndef INCLUDE__ARM_SPE_DECODER_H__ +#define INCLUDE__ARM_SPE_DECODER_H__ + +#include + +#include "arm-spe-pkt-decoder.h" + +enum arm_spe_sample_type { + ARM_SPE_L1D_ACCESS = 1 << 0, + ARM_SPE_L1D_MISS = 1 << 1, + ARM_SPE_LLC_ACCESS = 1 << 2, + ARM_SPE_LLC_MISS = 1 << 3, + ARM_SPE_TLB_ACCESS = 1 << 4, + ARM_SPE_TLB_MISS = 1 << 5, + ARM_SPE_BRANCH_MISS = 1 << 6, + ARM_SPE_REMOTE_ACCESS = 1 << 7, +}; + +enum arm_spe_op_type { + ARM_SPE_LD = 1 << 0, + ARM_SPE_ST = 1 << 1, +}; + +enum arm_spe_neoverse_data_source { + ARM_SPE_NV_L1D = 0x0, + ARM_SPE_NV_L2 = 0x8, + ARM_SPE_NV_PEER_CORE = 0x9, + ARM_SPE_NV_LOCAL_CLUSTER = 0xa, + ARM_SPE_NV_SYS_CACHE = 0xb, + ARM_SPE_NV_PEER_CLUSTER = 0xc, + ARM_SPE_NV_REMOTE = 0xd, + ARM_SPE_NV_DRAM = 0xe, +}; + +struct arm_spe_record { + enum arm_spe_sample_type type; + int err; + u32 op; + u32 latency; + u64 from_ip; + u64 to_ip; + u64 timestamp; + u64 virt_addr; + u64 phys_addr; + u64 context_id; + u64 boost_spe_addr[8]; + u64 remote_addr; + u16 boost_spe_idx; + u16 source; +}; + +struct arm_spe_buffer { + const unsigned char *buf; + size_t len; + u64 offset; + u64 trace_nr; +}; + +struct arm_spe_decoder { + struct arm_spe_record record; + const unsigned char *buf; + size_t len; + struct arm_spe_pkt packet; +}; + +void arm_spe_decoder_free(struct arm_spe_decoder *decoder); +void arm_spe_decode_buf(const unsigned char *buf, size_t len); +void mm_spe_record_enqueue(struct arm_spe_record *record); +#endif diff --git a/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.c b/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.c new file mode 100644 index 0000000000000000000000000000000000000000..aeec434487798475c7899cce9f91e8fb0a6f272e --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Arm Statistical Profiling Extensions (SPE) support + * Copyright (c) 2017-2018, Arm Ltd. 
+ */ + +#include +#include +#include +#include +#include + +#include "arm-spe-pkt-decoder.h" + +/* + * Extracts the field "sz" from header bits and converts to bytes: + * 00 : byte (1) + * 01 : halfword (2) + * 10 : word (4) + * 11 : doubleword (8) + */ +static unsigned int arm_spe_payload_len(unsigned char hdr) +{ + return 1U << ((hdr & GENMASK_ULL(5, 4)) >> 4); +} + +static int arm_spe_get_payload(const unsigned char *buf, size_t len, + unsigned char ext_hdr, + struct arm_spe_pkt *packet) +{ + size_t payload_len = arm_spe_payload_len(buf[ext_hdr]); + + if (len < 1 + ext_hdr + payload_len) + return ARM_SPE_NEED_MORE_BYTES; + + buf += 1 + ext_hdr; + + switch (payload_len) { + case 1: + packet->payload = *(uint8_t *)buf; + break; + case 2: + packet->payload = le16_to_cpu(*(uint16_t *)buf); + break; + case 4: + packet->payload = le32_to_cpu(*(uint32_t *)buf); + break; + case 8: + packet->payload = le64_to_cpu(*(uint64_t *)buf); + break; + default: + return ARM_SPE_BAD_PACKET; + } + + return 1 + ext_hdr + payload_len; +} + +static int arm_spe_get_pad(struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_PAD; + return 1; +} + +static int arm_spe_get_alignment(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + unsigned int alignment = 1 << ((buf[0] & 0xf) + 1); + + if (len < alignment) + return ARM_SPE_NEED_MORE_BYTES; + + packet->type = ARM_SPE_PAD; + return alignment - (((uintptr_t)buf) & (alignment - 1)); +} + +static int arm_spe_get_end(struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_END; + return 1; +} + +static int arm_spe_get_timestamp(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_TIMESTAMP; + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_events(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_EVENTS; + + /* we use index to identify Events with a less number of + * comparisons in arm_spe_pkt_desc(): E.g., the LLC-ACCESS, + * LLC-REFILL, and REMOTE-ACCESS events are identified if + * index > 1. 
+ */ + packet->index = arm_spe_payload_len(buf[0]); + + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_data_source(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_DATA_SOURCE; + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_context(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_CONTEXT; + packet->index = SPE_CTX_PKT_HDR_INDEX(buf[0]); + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_op_type(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_OP_TYPE; + packet->index = SPE_OP_PKT_HDR_CLASS(buf[0]); + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_counter(const unsigned char *buf, size_t len, + const unsigned char ext_hdr, struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_COUNTER; + + if (ext_hdr) + packet->index = SPE_HDR_EXTENDED_INDEX(buf[0], buf[1]); + else + packet->index = SPE_HDR_SHORT_INDEX(buf[0]); + + return arm_spe_get_payload(buf, len, ext_hdr, packet); +} + +static int arm_spe_get_addr(const unsigned char *buf, size_t len, + const unsigned char ext_hdr, struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_ADDRESS; + + if (ext_hdr) + packet->index = SPE_HDR_EXTENDED_INDEX(buf[0], buf[1]); + else + packet->index = SPE_HDR_SHORT_INDEX(buf[0]); + + return arm_spe_get_payload(buf, len, ext_hdr, packet); +} + +static int arm_spe_do_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + unsigned int hdr; + unsigned char ext_hdr = 0; + + memset(packet, 0, sizeof(struct arm_spe_pkt)); + + if (!len) + return ARM_SPE_NEED_MORE_BYTES; + + hdr = buf[0]; + + if (hdr == SPE_HEADER0_PAD) + return arm_spe_get_pad(packet); + + if (hdr == SPE_HEADER0_END) /* no timestamp at end of record */ + return arm_spe_get_end(packet); + + if (hdr == SPE_HEADER0_TIMESTAMP) + return arm_spe_get_timestamp(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK1) == SPE_HEADER0_EVENTS) + return arm_spe_get_events(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK1) == SPE_HEADER0_SOURCE) + return arm_spe_get_data_source(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_CONTEXT) + return arm_spe_get_context(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_OP_TYPE) + return arm_spe_get_op_type(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_EXTENDED) { + /* 16-bit extended format header */ + if (len == 1) + return ARM_SPE_BAD_PACKET; + + ext_hdr = 1; + hdr = buf[1]; + if (hdr == SPE_HEADER1_ALIGNMENT) + return arm_spe_get_alignment(buf, len, packet); + } + + /* + * The short format header's byte 0 or the extended format header's + * byte 1 has been assigned to 'hdr', which uses the same encoding for + * address packet and counter packet, so don't need to distinguish if + * it's short format or extended format and handle in once. + */ + if ((hdr & SPE_HEADER0_MASK3) == SPE_HEADER0_ADDRESS) + return arm_spe_get_addr(buf, len, ext_hdr, packet); + + if ((hdr & SPE_HEADER0_MASK3) == SPE_HEADER0_COUNTER) + return arm_spe_get_counter(buf, len, ext_hdr, packet); + + return ARM_SPE_BAD_PACKET; +} + +int arm_spe_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + int ret; + + ret = arm_spe_do_get_packet(buf, len, packet); + /* put multiple consecutive PADs on the same line, up to + * the fixed-width output format of 16 bytes per line. 
+ */ + if (ret > 0 && packet->type == ARM_SPE_PAD) { + while (ret < 16 && len > (size_t)ret && !buf[ret]) + ret += 1; + } + return ret; +} diff --git a/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.h b/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..873c3590e4a8769ab587cfe7580f8a3d3e69e6b7 --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Arm Statistical Profiling Extensions (SPE) support + * Copyright (c) 2017-2018, Arm Ltd. + */ + +#ifndef INCLUDE__ARM_SPE_PKT_DECODER_H__ +#define INCLUDE__ARM_SPE_PKT_DECODER_H__ + +#include + +#define ARM_SPE_PKT_DESC_MAX 256 +#define ARM_SPE_NEED_MORE_BYTES -1 +#define ARM_SPE_BAD_PACKET -2 +#define ARM_SPE_PKT_MAX_SZ 16 + +enum arm_spe_pkt_type { + ARM_SPE_BAD, + ARM_SPE_PAD, + ARM_SPE_END, + ARM_SPE_TIMESTAMP, + ARM_SPE_ADDRESS, + ARM_SPE_COUNTER, + ARM_SPE_CONTEXT, + ARM_SPE_OP_TYPE, + ARM_SPE_EVENTS, + ARM_SPE_DATA_SOURCE, +}; + +struct arm_spe_pkt { + enum arm_spe_pkt_type type; + unsigned char index; + uint64_t payload; +}; + +/* Short header (HEADER0) and extended header (HEADER1) */ +#define SPE_HEADER0_PAD 0x0 +#define SPE_HEADER0_END 0x1 +#define SPE_HEADER0_TIMESTAMP 0x71 +/* Mask for event & data source */ +#define SPE_HEADER0_MASK1 (GENMASK_ULL(7, 6) | GENMASK_ULL(3, 0)) +#define SPE_HEADER0_EVENTS 0x42 +#define SPE_HEADER0_SOURCE 0x43 +/* Mask for context & operation */ +#define SPE_HEADER0_MASK2 GENMASK_ULL(7, 2) +#define SPE_HEADER0_CONTEXT 0x64 +#define SPE_HEADER0_OP_TYPE 0x48 +/* Mask for extended format */ +#define SPE_HEADER0_EXTENDED 0x20 +/* Mask for address & counter */ +#define SPE_HEADER0_MASK3 GENMASK_ULL(7, 3) +#define SPE_HEADER0_ADDRESS 0xb0 +#define SPE_HEADER0_COUNTER 0x98 +#define SPE_HEADER1_ALIGNMENT 0x0 + +#define SPE_HDR_SHORT_INDEX(h) ((h) & GENMASK_ULL(2, 0)) +#define SPE_HDR_EXTENDED_INDEX(h0, h1) (((h0) & GENMASK_ULL(1, 0)) << 3 | \ + SPE_HDR_SHORT_INDEX(h1)) + +/* Address packet header */ +#define SPE_ADDR_PKT_HDR_INDEX_INS 0x0 +#define SPE_ADDR_PKT_HDR_INDEX_BRANCH 0x1 +#define SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT 0x2 +#define SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS 0x3 +#define SPE_ADDR_PKT_HDR_INDEX_PREV_BRANCH 0x4 +#define SPE_ADDR_PKT_HDR_INDEX_BOOST_SPE_DATA_PHYS 0x6 +#define SPE_ADDR_PKT_HDR_INDEX_REMOTE_DATA_PHYS 0x7 + +/* Address packet payload */ +#define SPE_ADDR_PKT_ADDR_BYTE7_SHIFT 56 +#define SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(v) ((v) & GENMASK_ULL(55, 0)) +#define SPE_ADDR_PKT_ADDR_GET_BYTE_6(v) (((v) & GENMASK_ULL(55, 48)) >> 48) + +#define SPE_ADDR_PKT_GET_NS(v) (((v) & BIT_ULL(63)) >> 63) +#define SPE_ADDR_PKT_GET_EL(v) (((v) & GENMASK_ULL(62, 61)) >> 61) +#define SPE_ADDR_PKT_GET_CH(v) (((v) & BIT_ULL(62)) >> 62) +#define SPE_ADDR_PKT_GET_PAT(v) (((v) & GENMASK_ULL(59, 56)) >> 56) + +#define SPE_ADDR_PKT_EL0 0 +#define SPE_ADDR_PKT_EL1 1 +#define SPE_ADDR_PKT_EL2 2 +#define SPE_ADDR_PKT_EL3 3 + +/* Boost_spe address packet payload */ +#define SPE_ADDR_PKT_ADDR_GET_BYTES_BOOST_SPE(v) ((v) & GENMASK_ULL(52, 12)) + +/* Context packet header */ +#define SPE_CTX_PKT_HDR_INDEX(h) ((h) & GENMASK_ULL(1, 0)) + +/* Counter packet header */ +#define SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT 0x0 +#define SPE_CNT_PKT_HDR_INDEX_ISSUE_LAT 0x1 +#define SPE_CNT_PKT_HDR_INDEX_TRANS_LAT 0x2 + +/* Event packet payload */ +enum arm_spe_events { + EV_EXCEPTION_GEN = 0, + EV_RETIRED = 1, + EV_L1D_ACCESS = 2, + EV_L1D_REFILL = 3, + EV_TLB_ACCESS = 4, + 
EV_TLB_WALK = 5, + EV_NOT_TAKEN = 6, + EV_MISPRED = 7, + EV_LLC_ACCESS = 8, + EV_LLC_MISS = 9, + EV_REMOTE_ACCESS = 10, + EV_ALIGNMENT = 11, + EV_PARTIAL_PREDICATE = 17, + EV_EMPTY_PREDICATE = 18, +}; + +/* Operation packet header */ +#define SPE_OP_PKT_HDR_CLASS(h) ((h) & GENMASK_ULL(1, 0)) +#define SPE_OP_PKT_HDR_CLASS_OTHER 0x0 +#define SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC 0x1 +#define SPE_OP_PKT_HDR_CLASS_BR_ERET 0x2 + +#define SPE_OP_PKT_IS_OTHER_SVE_OP(v) (((v) & (BIT(7) | BIT(3) | BIT(0))) == 0x8) + +#define SPE_OP_PKT_COND BIT(0) + +#define SPE_OP_PKT_LDST_SUBCLASS_GET(v) ((v) & GENMASK_ULL(7, 1)) +#define SPE_OP_PKT_LDST_SUBCLASS_GP_REG 0x0 +#define SPE_OP_PKT_LDST_SUBCLASS_SIMD_FP 0x4 +#define SPE_OP_PKT_LDST_SUBCLASS_UNSPEC_REG 0x10 +#define SPE_OP_PKT_LDST_SUBCLASS_NV_SYSREG 0x30 + +#define SPE_OP_PKT_IS_LDST_ATOMIC(v) (((v) & (GENMASK_ULL(7, 5) | BIT(1))) == 0x2) + +#define SPE_OP_PKT_AR BIT(4) +#define SPE_OP_PKT_EXCL BIT(3) +#define SPE_OP_PKT_AT BIT(2) +#define SPE_OP_PKT_ST BIT(0) + +#define SPE_OP_PKT_IS_LDST_SVE(v) (((v) & (BIT(3) | BIT(1))) == 0x8) + +#define SPE_OP_PKT_SVE_SG BIT(7) +/* + * SVE effective vector length (EVL) is stored in byte 0 bits [6:4]; + * the length is rounded up to a power of two and use 32 as one step, + * so EVL calculation is: + * + * 32 * (2 ^ bits [6:4]) = 32 << (bits [6:4]) + */ +#define SPE_OP_PKG_SVE_EVL(v) (32 << (((v) & GENMASK_ULL(6, 4)) >> 4)) +#define SPE_OP_PKT_SVE_PRED BIT(2) +#define SPE_OP_PKT_SVE_FP BIT(1) + +#define SPE_OP_PKT_IS_INDIRECT_BRANCH(v) (((v) & GENMASK_ULL(7, 1)) == 0x2) + +const char *arm_spe_pkt_name(enum arm_spe_pkt_type); + +int arm_spe_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet); + +int arm_spe_pkt_desc(const struct arm_spe_pkt *packet, char *buf, size_t len); +#endif diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index 05dda19c5359a39849ac9a3c6b1a5cf14eb0614a..85e72a392a31afefd5915040d9777187ca08350e 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -16,6 +16,10 @@ #include #include +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +#include "../drivers/arm/mm_monitor/mm_spe.h" +#endif + static DEFINE_PER_CPU(struct arm_pmu *, probed_pmus); static DEFINE_PER_CPU(int, pmu_irqs); @@ -162,6 +166,32 @@ static inline void arm_spe_acpi_register_device(void) { } #endif /* CONFIG_ARM_SPE_PMU */ +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +static struct resource spe_mem_sampling_resources[] = { + { + } +}; + +static struct platform_device spe_mem_sampling_dev = { + .name = ARMV8_SPE_MEM_SAMPLING_PDEV_NAME, + .id = -1, + .resource = spe_mem_sampling_resources, + .num_resources = ARRAY_SIZE(spe_mem_sampling_resources) +}; + +static void arm_spe_mem_sampling_acpi_register_device(void) +{ + int ret; + + ret = platform_device_register(&spe_mem_sampling_dev); + if (ret < 0) + pr_warn("ACPI: SPE_MEM_SAMPLING: Unable to register device\n"); +} +#else +static inline void arm_spe_mem_sampling_acpi_register_device(void) +{ +} +#endif /* CONFIG_ARM_SPE_MEM_SAMPLING */ #if IS_ENABLED(CONFIG_CORESIGHT_TRBE) static struct resource trbe_resources[] = { @@ -432,6 +462,7 @@ static int arm_pmu_acpi_init(void) return 0; arm_spe_acpi_register_device(); + arm_spe_mem_sampling_acpi_register_device(); arm_trbe_acpi_register_device(); return 0; diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 71835682046e133d4a17044e12c018842337ec0b..8562385d901e6c43a7902d12646135e4afb12650 100644 --- a/drivers/perf/arm_spe_pmu.c +++ 
b/drivers/perf/arm_spe_pmu.c @@ -33,12 +33,19 @@ #include #include #include +#if IS_ENABLED(CONFIG_MEM_SAMPLING) +#include +#endif #include #include #include #include +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +#include "../drivers/arm/mm_monitor/mm_spe.h" +#endif + /* * Cache if the event is allowed to trace Context information. * This allows us to perform the check, i.e, perf_allow_kernel(), @@ -46,6 +53,10 @@ */ #define SPE_PMU_HW_FLAGS_CX 0x00001 +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +static struct arm_spe_pmu *spe_pmu_local; +#endif + static_assert((PERF_EVENT_FLAG_ARCH & SPE_PMU_HW_FLAGS_CX) == SPE_PMU_HW_FLAGS_CX); static void set_spe_event_has_cx(struct perf_event *event) @@ -583,13 +594,21 @@ arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) * If we've lost data, disable profiling and also set the PARTIAL * flag to indicate that the last record is corrupted. */ +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user) && FIELD_GET(PMBSR_EL1_DL, pmbsr)) +#else if (FIELD_GET(PMBSR_EL1_DL, pmbsr)) +#endif perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED | PERF_AUX_FLAG_PARTIAL); /* Report collisions to userspace so that it can up the period */ +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user) && FIELD_GET(PMBSR_EL1_DL, pmbsr)) +#else if (FIELD_GET(PMBSR_EL1_COLL, pmbsr)) perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION); +#endif /* We only expect buffer management events */ switch (FIELD_GET(PMBSR_EL1_EC, pmbsr)) { @@ -622,7 +641,12 @@ arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) ret = SPE_PMU_BUF_FAULT_ACT_FATAL; out_stop: +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user)) + arm_spe_perf_aux_output_end(handle); +#else arm_spe_perf_aux_output_end(handle); +#endif return ret; } @@ -632,7 +656,11 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) struct perf_event *event = handle->event; enum arm_spe_pmu_buf_fault_action act; +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user) && !perf_get_aux(handle)) +#else if (!perf_get_aux(handle)) +#endif return IRQ_NONE; act = arm_spe_pmu_buf_get_fault_act(handle); @@ -643,7 +671,12 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) * Ensure perf callbacks have completed, which may disable the * profiling buffer in response to a TRUNCATION flag. */ +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user)) + irq_work_run(); +#else irq_work_run(); +#endif switch (act) { case SPE_PMU_BUF_FAULT_ACT_FATAL: @@ -663,6 +696,12 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) * PMBPTR might be misaligned, but we'll burn that bridge * when we get to it. 
*/ +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (__this_cpu_read(arm_spe_user)) { + mem_sampling_process(); + break; + } +#endif if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) { arm_spe_perf_aux_output_begin(handle, event); isb(); @@ -758,6 +797,10 @@ static void arm_spe_pmu_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle); +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + arm_spe_set_user(SPE_USER_PERF); +#endif + hwc->state = 0; arm_spe_perf_aux_output_begin(handle, event); if (hwc->state) @@ -797,8 +840,16 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle); /* If we're already stopped, then nothing to do */ - if (hwc->state & PERF_HES_STOPPED) + if (hwc->state & PERF_HES_STOPPED) { +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + /* + * PERF_HES_STOPPED maybe set in arm_spe_perf_aux_output_begin, + * we switch user here. + */ + arm_spe_set_user(SPE_USER_MEM_SAMPLING); +#endif return; + } /* Stop all trace generation */ arm_spe_pmu_disable_and_drain_local(); @@ -829,6 +880,9 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) } hwc->state |= PERF_HES_STOPPED; +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + arm_spe_set_user(SPE_USER_MEM_SAMPLING); +#endif } static int arm_spe_pmu_add(struct perf_event *event, int flags) @@ -1129,6 +1183,9 @@ static int arm_spe_pmu_cpu_startup(unsigned int cpu, struct hlist_node *node) if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) return 0; +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) + mm_spe_percpu_buffer_alloc(cpu); +#endif __arm_spe_pmu_setup_one(spe_pmu); return 0; } @@ -1141,6 +1198,9 @@ static int arm_spe_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) return 0; +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) + mm_spe_percpu_buffer_free(cpu); +#endif __arm_spe_pmu_stop_one(spe_pmu); return 0; } @@ -1176,6 +1236,9 @@ static int arm_spe_pmu_dev_init(struct arm_spe_pmu *spe_pmu) static void arm_spe_pmu_dev_teardown(struct arm_spe_pmu *spe_pmu) { +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) + mm_spe_buffer_free(); +#endif cpuhp_state_remove_instance(arm_spe_pmu_online, &spe_pmu->hotplug_node); free_percpu_irq(spe_pmu->irq, spe_pmu->handle); } @@ -1215,6 +1278,26 @@ static const struct platform_device_id arm_spe_match[] = { }; MODULE_DEVICE_TABLE(platform, arm_spe_match); +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +static bool arm_spe_get_attr(void) +{ + struct mm_spe *p; + + p = mm_spe_get_desc(); + if (!p) { + pr_err("get spe pmu cap from arm spe driver failed!\n"); + return false; + } + + p->supported_cpus = spe_pmu_local->supported_cpus; + p->irq = spe_pmu_local->irq; + p->features = spe_pmu_local->features; + p->min_period = spe_pmu_local->min_period; + + return true; +} +#endif + static int arm_spe_pmu_device_probe(struct platform_device *pdev) { int ret; @@ -1249,6 +1332,21 @@ static int arm_spe_pmu_device_probe(struct platform_device *pdev) if (ret) goto out_free_handle; +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) + /* + * Ensure that all CPUs that support SPE can apply for the cache + * area, with each CPU defaulting to 4K * 2. Failure to do so will + * result in the inability to collect SPE data in kernel mode. 
+ */ + ret = mm_spe_buffer_alloc(); + if (ret) + goto out_teardown_dev; + + spe_pmu_local = spe_pmu; + if (arm_spe_get_attr()) + mm_spe_add_probe_status(); + +#endif ret = arm_spe_pmu_perf_init(spe_pmu); if (ret) goto out_teardown_dev; @@ -1262,6 +1360,18 @@ static int arm_spe_pmu_device_probe(struct platform_device *pdev) return ret; } +#if IS_ENABLED(CONFIG_MEM_SAMPLING) +void arm_spe_set_user(enum arm_spe_user_e user) +{ + __this_cpu_write(arm_spe_user, user); + if (user == SPE_USER_PERF) + mem_sampling_user_switch_process(USER_SWITCH_AWAY_FROM_MEM_SAMPLING); + else + mem_sampling_user_switch_process(USER_SWITCH_BACK_TO_MEM_SAMPLING); + __arm_spe_pmu_reset_local(); +} +#endif + static int arm_spe_pmu_device_remove(struct platform_device *pdev) { struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev); diff --git a/include/linux/damon.h b/include/linux/damon.h index 343132a146cf04b546a07eb6c653608d55aac9c1..e544de649dc3ec1db69b4054eeec6e0503ab4576 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -13,6 +13,7 @@ #include #include #include +#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE @@ -73,6 +74,9 @@ struct damon_region { */ struct damon_target { struct pid *pid; +#ifdef CONFIG_DAMON_MEM_SAMPLING + struct damon_mem_sampling_fifo damon_fifo; +#endif unsigned int nr_regions; struct list_head regions_list; struct list_head list; @@ -89,6 +93,7 @@ struct damon_target { * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. + * @DAMOS_DEMOTION: Migrate cold page areas to specific nodes. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions * @@ -106,6 +111,7 @@ enum damos_action { DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, + DAMOS_DEMOTION, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; @@ -298,6 +304,7 @@ struct damos_access_pattern { * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @pattern: Access pattern of target regions. * @action: &damo_action to be applied to the target regions. + * @remote_node: The NUMA node ID from which the cold page will be moved. * @apply_interval_us: The time between applying the @action. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. @@ -330,6 +337,7 @@ struct damos_access_pattern { struct damos { struct damos_access_pattern pattern; enum damos_action action; + nodemask_t remote_node; unsigned long apply_interval_us; /* private: internal use only */ /* diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h new file mode 100644 index 0000000000000000000000000000000000000000..602a5efcb77c9840becd7cdce69c24ef1841086f --- /dev/null +++ b/include/linux/mem_sampling.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * mem_sampling.h: declare the mem_sampling abstract layer and provide + * unified pmu sampling for NUMA, DAMON, etc. + * + * Sample records are converted to mem_sampling_record, and then + * mem_sampling_record_captured_cb_type invoke the callbacks to + * pass the record. + * + * Copyright (c) 2024-2025, Huawei Technologies Ltd. 
+ */ +#ifndef __MEM_SAMPLING_H +#define __MEM_SAMPLING_H + +#include + +enum mem_sampling_sample_type { + MEM_SAMPLING_L1D_ACCESS = 1 << 0, + MEM_SAMPLING_L1D_MISS = 1 << 1, + MEM_SAMPLING_LLC_ACCESS = 1 << 2, + MEM_SAMPLING_LLC_MISS = 1 << 3, + MEM_SAMPLING_TLB_ACCESS = 1 << 4, + MEM_SAMPLING_TLB_MISS = 1 << 5, + MEM_SAMPLING_BRANCH_MISS = 1 << 6, + MEM_SAMPLING_REMOTE_ACCESS = 1 << 7, +}; + +enum mem_sampling_op_type { + MEM_SAMPLING_LD = 1 << 0, + MEM_SAMPLING_ST = 1 << 1, +}; + +enum arm_spe_user_e { + SPE_USER_PERF, + SPE_USER_MEM_SAMPLING, +}; +DECLARE_PER_CPU(enum arm_spe_user_e, arm_spe_user); + +struct mem_sampling_record { + enum mem_sampling_sample_type type; + int err; + u32 op; + u32 latency; + u64 from_ip; + u64 to_ip; + u64 timestamp; + u64 virt_addr; + u64 phys_addr; + u64 context_id; + u64 boost_spe_addr[8]; + u64 rem_addr; + u16 source; +}; + +struct mem_sampling_ops_struct { + int (*sampling_start)(void); + void (*sampling_stop)(void); + void (*sampling_continue)(void); + void (*sampling_decoding)(void); + struct mm_spe_buf* (*mm_spe_getbuf_addr)(void); + int (*mm_spe_getnum_record)(void); + +}; +extern struct mem_sampling_ops_struct mem_sampling_ops; + +enum mem_sampling_type_enum { + MEM_SAMPLING_ARM_SPE, + MEM_SAMPLING_UNSUPPORTED +}; + +enum user_switch_type { + USER_SWITCH_AWAY_FROM_MEM_SAMPLING, + USER_SWITCH_BACK_TO_MEM_SAMPLING, +}; + +DECLARE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling); +extern struct static_key_false mem_sampling_access_hints; + +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING +int mm_spe_start(void); +void mm_spe_stop(void); +void mm_spe_continue(void); +void mm_spe_decoding(void); +int mm_spe_getnum_record(void); +struct mm_spe_buf *mm_spe_getbuf_addr(void); +int mm_spe_enabled(void); +void arm_spe_set_probe_status(int status); +#else +static inline void mm_spe_stop(void) { } +static inline void mm_spe_continue(void) { } +static inline void mm_spe_decoding(void) { } +static inline void arm_spe_set_probe_status(int status) { } +static inline int mm_spe_start(void) { return 0; } +static inline int mm_spe_getnum_record(void) { return 0; } +static inline struct mm_spe_buf *mm_spe_getbuf_addr(void) { return NULL; } +static inline int mm_spe_enabled(void) { return 0; } +#endif /* CONFIG_ARM_SPE_MEM_SAMPLING */ + +#if IS_ENABLED(CONFIG_MEM_SAMPLING) +void mem_sampling_process(void); +void arm_spe_set_user(enum arm_spe_user_e user); +void set_mem_sampling_state(bool enabled); +void mem_sampling_user_switch_process(enum user_switch_type type); +void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr); +#else +static inline void set_mem_sampling_state(bool enabled) { } +static inline void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr) { } +#endif /* CONFIG_MEM_SAMPLING */ + +#ifdef CONFIG_DAMON_MEM_SAMPLING +#define DAMOS_FIFO_MAX_RECORD (1024) +struct damon_mem_sampling_record { + u64 vaddr; +}; + +struct damon_mem_sampling_fifo { + struct kfifo rx_kfifo; + spinlock_t rx_kfifo_lock; /* protect SPE Rx data kfifo */ +}; + +bool damon_use_mem_sampling(void); +#else +static inline bool damon_use_mem_sampling(void) { return false; } +#endif /* CONFIG_DAMON_MEM_SAMPLING */ +#endif /* __MEM_SAMPLING_H */ diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 2e81ac87e6f6d91610efb3daafa2c33187bab8b3..29cc0d842a8fa81ec2dad3a5d4357a2bb94c45e0 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -167,6 +167,8 @@ static inline void check_highest_zone(enum zone_type k) int 
do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); +int do_migrate_area_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, unsigned long start, unsigned long end, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index f37cc03f9369ed61e49f7be69564fa017ab51801..302c659dc626c92ba258c62671f4cca97412209c 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -29,6 +29,7 @@ enum migrate_reason { MR_CONTIG_RANGE, MR_LONGTERM_PIN, MR_DEMOTION, + MR_DAMON_DEMOTION, MR_TYPES }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b4442fbbf17bf7df534f03c3807d762dc17209c6..64c38b09e18d5579dd362cc160f68d6535c70428 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1011,7 +1011,11 @@ struct mm_struct { #endif } __randomize_layout; +#ifdef CONFIG_DAMON_MEM_SAMPLING + KABI_USE(1, struct damon_mem_sampling_fifo *damon_fifo) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index a4e40ae6a8c8fd2e51cdbf877cf3bb485690cc9a..4bcbf613c9a3852256737fd19e1bf8937a172ccf 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -8,6 +8,7 @@ #include #include #include +#include TRACE_EVENT(kmem_cache_alloc, @@ -409,6 +410,117 @@ TRACE_EVENT(rss_stat, __print_symbolic(__entry->member, TRACE_MM_PAGES), __entry->size) ); +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING +TRACE_EVENT(mm_spe_record, + TP_PROTO(struct mem_sampling_record *record), + + TP_ARGS(record), + + TP_STRUCT__entry( + __field(u64, vaddr) + __field(u64, paddr) + __field(int, pid) + ), + + TP_fast_assign( + __entry->vaddr = record->virt_addr; + __entry->paddr = record->phys_addr; + __entry->pid = record->context_id; + + ), + + TP_printk("vaddr=%llu paddr=%llu pid=%d", + __entry->vaddr, __entry->paddr, __entry->pid) +); + +TRACE_EVENT(spe_boost_spe_record, + TP_PROTO(struct mem_sampling_record *record), + + TP_ARGS(record), + + TP_STRUCT__entry( + __field(u64, boost_spe_pa1) + __field(u64, boost_spe_pa2) + __field(u64, boost_spe_pa3) + __field(u64, boost_spe_pa4) + __field(u64, boost_spe_pa5) + __field(u64, boost_spe_pa6) + __field(u64, boost_spe_pa7) + __field(u64, boost_spe_pa8) + ), + + TP_fast_assign( + __entry->boost_spe_pa1 = record->boost_spe_addr[0]; + __entry->boost_spe_pa2 = record->boost_spe_addr[1]; + __entry->boost_spe_pa3 = record->boost_spe_addr[2]; + __entry->boost_spe_pa4 = record->boost_spe_addr[3]; + __entry->boost_spe_pa5 = record->boost_spe_addr[4]; + __entry->boost_spe_pa6 = record->boost_spe_addr[5]; + __entry->boost_spe_pa7 = record->boost_spe_addr[6]; + __entry->boost_spe_pa8 = record->boost_spe_addr[7]; + ), + + TP_printk("boost_spe_addr[0]=0x%llx boost_spe_addr[1]=0x%llx tlb_addr[2]=0x%llx tlb_addr[3]=0x%llx tlb_addr[4]=0x%llx tlb_addr[5]=0x%llx tlb_addr[6]=0x%llx tlb_addr[7]=0x%llx", + __entry->boost_spe_pa1, __entry->boost_spe_pa2, + __entry->boost_spe_pa3, __entry->boost_spe_pa4, + __entry->boost_spe_pa5, __entry->boost_spe_pa6, + __entry->boost_spe_pa7, __entry->boost_spe_pa8) +); +#endif /* CONFIG_ARM_SPE_MEM_SAMPLING */ + + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +TRACE_EVENT(mm_numa_migrating, + + TP_PROTO(u64 vaddr, int page_nid, int target_nid, + int migrate_success), + + TP_ARGS(vaddr, page_nid, target_nid, migrate_success), + + TP_STRUCT__entry( + __field(u64, 
vaddr) + __field(int, page_nid) + __field(int, target_nid) + __field(int, migrate_success) + ), + + TP_fast_assign( + __entry->vaddr = vaddr; + __entry->page_nid = page_nid; + __entry->target_nid = target_nid; + __entry->migrate_success = !!(migrate_success); + ), + + TP_printk("vaddr=%llu page_nid=%d target_nid=%d migrate_success=%d", + __entry->vaddr, __entry->page_nid, + __entry->target_nid, __entry->migrate_success) +); + +TRACE_EVENT(mm_mem_sampling_access_record, + + TP_PROTO(u64 vaddr, u64 paddr, int cpuid, int pid), + + TP_ARGS(vaddr, paddr, cpuid, pid), + + TP_STRUCT__entry( + __field(u64, vaddr) + __field(u64, paddr) + __field(int, cpuid) + __field(int, pid) + ), + + TP_fast_assign( + __entry->vaddr = vaddr; + __entry->paddr = paddr; + __entry->cpuid = cpuid; + __entry->pid = pid; + ), + + TP_printk("vaddr=%llu paddr=%llu cpuid=%d pid=%d", + __entry->vaddr, __entry->paddr, + __entry->cpuid, __entry->pid) +); +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 0190ef725b439696973eaefb75e453c360535117..bafe4208de73328a09258eac0385a4c14571fbfb 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -22,7 +22,8 @@ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ EM( MR_CONTIG_RANGE, "contig_range") \ EM( MR_LONGTERM_PIN, "longterm_pin") \ - EMe(MR_DEMOTION, "demotion") + EM(MR_DEMOTION, "demotion") \ + EMe(MR_DAMON_DEMOTION, "damon_demotion") /* * First define the enums in the above macros to be exported to userspace diff --git a/kernel/fork.c b/kernel/fork.c index 698d7829f2e448d5684fad1d8ad0c593c9755c3e..4b37cb915f7b8d0d9a8e7571bf92a259a9d25287 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1362,6 +1362,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; +#endif +#if defined(CONFIG_DAMON_MEM_SAMPLING) + mm->damon_fifo = NULL; #endif mm_init_uprobes_state(mm); hugetlb_count_init(mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7dc4ceebd5ec0e9ecddb4fd28ec139aa63858d02..7c40690ad56f4e4ad4d2986ce71e96923f6540ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -64,6 +64,7 @@ #include #include #include +#include #ifdef CONFIG_PREEMPT_DYNAMIC # ifdef CONFIG_GENERIC_ENTRY @@ -5307,6 +5308,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = READ_ONCE(prev->__state); vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + mem_sampling_sched_in(prev, current); finish_task(prev); tick_nohz_task_switch(); finish_lock_switch(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c530d501bb48637e6b5c6ac41eba34b73f745c9f..468a4d747933678015a5420df79f8b19e98b1fe7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -48,6 +48,7 @@ #include #include #include +#include #include @@ -3368,6 +3369,18 @@ static void task_numa_work(struct callback_head *work) long pages, virtpages; struct vma_iterator vmi; +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + /* + * If we are using access hints from hardware (like using + * SPE), don't scan the address space. + * Note that currently PMD-level page migration is not + * supported. 
+ */ + if (static_branch_unlikely(&mem_sampling_access_hints) && + static_branch_unlikely(&sched_numabalancing_mem_sampling)) + return; +#endif + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); work->next = work; diff --git a/mm/Kconfig b/mm/Kconfig index 56171b9dd8730088ab7028d2902183d0ce9a6290..88addd002bb5d29f6bb0361c03e6c2007e326c1f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1452,6 +1452,34 @@ config BPF_READAHEAD of the kernel is adjusted based on the application read mode to optimize the read performance in the Spark SQL scenario, +config MEM_SAMPLING + bool "Use hardware memory sampling for kernel features (NUMA, DAMON, etc.)" + default n + depends on ARM64 + select ARM_SPE_MEM_SAMPLING if ARM64 + help + This option enables hardware-based memory sampling for kernel features + such as NUMA balancing and DAMON. If disabled, software-based memory + sampling will be used instead. + + Memory sampling is based on specific hardware capabilities, which + allow hardware PMUs to sample memory accesses for use by kernel + features. It requires at least one hardware PMU (e.g. ARM_SPE_MEM_SAMPLING) + to be enabled. + +config NUMABALANCING_MEM_SAMPLING + bool "Use hardware memory samples for NUMA balancing" + depends on MEM_SAMPLING && NUMA_BALANCING + default n + help + This feature relies on hardware sampling: the NUMA balancing policy + will use memory access information obtained from hardware sampling + instead of the native software PROT_NONE scheme. Turning on this + feature may have a performance impact on some workloads, for + example, lightweight memory access programs. + + If unsure, say N. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 11df2de8fdbe9d5a70e4ca3a73db68ebd9c9d331..674777b7c99ff0952edadda1c2ebb3b0b12e9f1b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -147,3 +147,4 @@ obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o obj-$(CONFIG_DYNAMIC_POOL) += dynamic_pool.o +obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 436c6b4cb5ec57fb7e47d093569ec093d7ec7401..d6ed1ef6ad4a5e76210c2daad5207be9b2743bfd 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -32,6 +32,20 @@ config DAMON_VADDR This builds the default data access monitoring operations for DAMON that work for virtual address spaces. +config DAMON_MEM_SAMPLING + bool "Set DAMON to use records from hardware sampling" + depends on MEM_SAMPLING && DAMON_VADDR + help + This enables DAMON to utilize hardware sampling-based memory access + monitoring data (e.g., ARM SPE, Intel PEBS, AMD IBS) instead of + software-based sampling. When enabled, DAMON will: + + - Use CPU performance monitoring unit (PMU) samples as data source + - Correlate hardware samples with process virtual address spaces + - Provide lower overhead monitoring compared to pure software approaches + + If unsure, say N. 
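The task_numa_work() hunk above shows the gating pattern this series applies to consumers of hardware access hints: the software scanning path is skipped only when both the global mem_sampling_access_hints key and the consumer's own per-feature key are enabled. Below is a minimal sketch of that pattern; it is illustrative only, the my_feature_* names are hypothetical and not part of this patch, and mem_sampling_access_hints itself is defined later in mm/mem_sampling.c.

/*
 * Illustrative sketch only: the double static-key check used by
 * hardware-sampling consumers. "my_feature_sampling" is hypothetical.
 */
#include <linux/jump_label.h>

DECLARE_STATIC_KEY_FALSE(mem_sampling_access_hints);	/* from mm/mem_sampling.c */
DEFINE_STATIC_KEY_FALSE(my_feature_sampling);

static void my_feature_scan_work(void)
{
	/*
	 * Skip the software (PROT_NONE-style) scan only when hardware
	 * sampling is globally enabled AND this feature has opted in.
	 */
	if (static_branch_unlikely(&mem_sampling_access_hints) &&
	    static_branch_unlikely(&my_feature_sampling))
		return;

	/* ...existing software scanning path... */
}

At run time the keys are flipped through the kernel.mem_sampling_enable sysctl added later in mm/mem_sampling.c: writing 2 additionally enables the NUMA-balancing consumer, 3 the DAMON consumer, and 4 both.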
+ config DAMON_PADDR bool "Data access monitoring operations for the physical address space" depends on DAMON && MMU diff --git a/mm/damon/core.c b/mm/damon/core.c index 1daa8793c44b3c1256e0255d158b3d33741c5535..c8a4427d1d630617072a076ae311ad640c422619 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -112,6 +112,32 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) return err; } +#if IS_ENABLED(CONFIG_DAMON_MEM_SAMPLING) +int damon_target_init_kfifo(struct damon_target *t) +{ + struct damon_mem_sampling_fifo *damon_fifo; + int ret = 0; + unsigned int fifo_size = sizeof(struct damon_mem_sampling_record) * DAMOS_FIFO_MAX_RECORD; + + damon_fifo = &t->damon_fifo; + + ret = kfifo_alloc(&damon_fifo->rx_kfifo, fifo_size, GFP_KERNEL); + if (ret) + return -ENOMEM; + + spin_lock_init(&damon_fifo->rx_kfifo_lock); + return 0; +} + +void damon_target_deinit_kfifo(struct damon_target *t) +{ + kfifo_free(&t->damon_fifo.rx_kfifo); +} +#else +static inline int damon_target_init_kfifo(struct damon_target *t) {return 0; } +static inline void damon_target_deinit_kfifo(struct damon_target *t) { } +#endif /* CONFIG_DAMON_MEM_SAMPLING */ + /* * Construct a damon_region struct * @@ -388,11 +414,18 @@ void damon_destroy_scheme(struct damos *s) struct damon_target *damon_new_target(void) { struct damon_target *t; + int ret; t = kmalloc(sizeof(*t), GFP_KERNEL); if (!t) return NULL; + ret = damon_target_init_kfifo(t); + if (ret) { + kfree(t); + return NULL; + } + t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); @@ -422,6 +455,7 @@ void damon_free_target(struct damon_target *t) damon_for_each_region_safe(r, next, t) damon_free_region(r); + damon_target_deinit_kfifo(t); kfree(t); } diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 26c948f87489ee1cdf9d97de9877ebc23185a8ed..dc570e90abca73c275cc3609467c6ca6c4678de8 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1123,6 +1123,7 @@ static const struct kobj_type damon_sysfs_access_pattern_ktype = { struct damon_sysfs_scheme { struct kobject kobj; enum damos_action action; + nodemask_t remote_node; struct damon_sysfs_access_pattern *access_pattern; struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; @@ -1140,6 +1141,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { "nohugepage", "lru_prio", "lru_deprio", + "demotion", "stat", }; @@ -1153,6 +1155,7 @@ static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( return NULL; scheme->kobj = (struct kobject){}; scheme->action = action; + scheme->remote_node = NODE_MASK_ALL; return scheme; } @@ -1356,6 +1359,36 @@ static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, return -EINVAL; } +static ssize_t remote_node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%*pbl\n", + nodemask_pr_args(&scheme->remote_node)); +} + +static ssize_t remote_node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + int ret; + nodemask_t new_mask; + + ret = nodelist_parse(buf, new_mask); + if (ret < 0) + return -EINVAL; + + if (!nodes_subset(new_mask, node_states[N_MEMORY])) + return -EINVAL; + + nodes_and(scheme->remote_node, new_mask, node_states[N_MEMORY]); + return count; +} + + static void 
damon_sysfs_scheme_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); @@ -1364,8 +1397,12 @@ static void damon_sysfs_scheme_release(struct kobject *kobj) static struct kobj_attribute damon_sysfs_scheme_action_attr = __ATTR_RW_MODE(action, 0600); +static struct kobj_attribute damon_sysfs_scheme_remote_node_attr = + __ATTR_RW_MODE(remote_node, 0600); + static struct attribute *damon_sysfs_scheme_attrs[] = { &damon_sysfs_scheme_action_attr.attr, + &damon_sysfs_scheme_remote_node_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme); @@ -1644,6 +1681,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->pattern.max_age_region = access_pattern->age->max; scheme->action = sysfs_scheme->action; + scheme->remote_node = sysfs_scheme->remote_node; scheme->quota.ms = sysfs_quotas->ms; scheme->quota.sz = sysfs_quotas->sz; @@ -1687,6 +1725,8 @@ int damon_sysfs_set_schemes(struct damon_ctx *ctx, damon_destroy_scheme(scheme); return -ENOMEM; } + + scheme->remote_node = sysfs_schemes->schemes_arr[i]->remote_node; damon_add_scheme(ctx, scheme); } return 0; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 5764b9885e7d215cbae84bc64c2b79508c2d260d..3a21410e631e7e894d45bb6c48ad7c34a9bbe9e9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "ops-common.h" @@ -402,6 +403,118 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) mmap_read_unlock(mm); } +#if IS_ENABLED(CONFIG_DAMON_MEM_SAMPLING) +/* + * Functions for the access checking of the regions with mem sampling + */ +static void __hw_damon_va_prepare_access_check(struct damon_region *r) +{ + r->sampling_addr = 0; +} + +static void hw_damon_va_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + mm->damon_fifo = &t->damon_fifo; + damon_for_each_region(r, t) + __hw_damon_va_prepare_access_check(r); + mmput(mm); + } +} + +static void find_damon_region(struct damon_mem_sampling_record *damon_record, + struct damon_target *t, unsigned int *max_nr_accesses) +{ + struct damon_region *r; + unsigned long addr = damon_record->vaddr; + + damon_for_each_region(r, t) { + if (r->sampling_addr != 0) + return; + if (addr > r->ar.start && addr < r->ar.end) { + r->nr_accesses++; + r->sampling_addr = addr; + *max_nr_accesses = max(r->nr_accesses, *max_nr_accesses); + return; + } + } +} + +static unsigned int hw_damon_va_check_accesses(struct damon_ctx *ctx) +{ + unsigned int outs; + struct damon_target *t; + struct mm_struct *mm; + unsigned int max_nr_accesses = 0; + struct damon_mem_sampling_record damon_record; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + mm->damon_fifo = NULL; + mmput(mm); + while (!kfifo_is_empty(&t->damon_fifo.rx_kfifo)) { + outs = kfifo_out(&t->damon_fifo.rx_kfifo, &damon_record, + sizeof(struct damon_mem_sampling_record)); + if (outs != sizeof(struct damon_mem_sampling_record)) { + pr_debug("damon hw spe record corrupted header. 
Flush.\n"); + continue; + } + find_damon_region(&damon_record, t, &max_nr_accesses); + } + kfifo_reset_out(&t->damon_fifo.rx_kfifo); + } + + return max_nr_accesses; +} +#else +static inline void hw_damon_va_prepare_access_checks(struct damon_ctx *ctx) { } +static inline unsigned int hw_damon_va_check_accesses(struct damon_ctx *ctx) {return 0; } +#endif + +#ifdef CONFIG_MIGRATION +static unsigned long damon_migrate_pages(struct damon_target *t, + struct damon_region *r, nodemask_t task_remote_nodes) +{ + struct mm_struct *mm = NULL; + unsigned long applied; + struct task_struct *task; + nodemask_t task_nodes; + + task = damon_get_task_struct(t); + if (!task) + return 0; + task_nodes = cpuset_mems_allowed(task); + put_task_struct(task); + + mm = damon_get_mm(t); + if (!mm) + return 0; + + applied = do_migrate_area_pages(mm, &task_nodes, &task_remote_nodes, + r->ar.start, r->ar.end, MPOL_MF_MOVE_ALL); + + mmput(mm); + + return applied; +} + +#else +static inline unsigned long damon_migrate_pages(struct damon_target *t, + struct damon_region *r, nodemask_t task_remote_nodes) +{ + return 0; +} +#endif /* CONFIG_MIGRATION */ + /* * Functions for the access checking of the regions */ @@ -420,6 +533,11 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) struct mm_struct *mm; struct damon_region *r; + if (damon_use_mem_sampling()) { + hw_damon_va_prepare_access_checks(ctx); + return; + } + damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) @@ -589,6 +707,11 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) unsigned int max_nr_accesses = 0; bool same_target; + if (damon_use_mem_sampling()) { + max_nr_accesses = hw_damon_va_check_accesses(ctx); + return max_nr_accesses; + } + damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) @@ -670,6 +793,8 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_DEMOTION: + return damon_migrate_pages(t, r, scheme->remote_node); case DAMOS_STAT: return 0; default: @@ -690,6 +815,8 @@ static int damon_va_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_cold_score(context, r, scheme); + case DAMOS_DEMOTION: + return damon_cold_score(context, r, scheme); default: break; } diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c new file mode 100644 index 0000000000000000000000000000000000000000..126cf71a9fb25dd2a62b5fe265a6fe10c6aed533 --- /dev/null +++ b/mm/mem_sampling.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mem_sampling.c: declare the mem_sampling abstract layer and provide + * unified pmu sampling for NUMA, DAMON, etc. + * + * Sample records are converted to mem_sampling_record, and then + * mem_sampling_record_captured_cb_type invoke the callbacks to + * pass the record. + * + * Copyright (c) 2024-2025, Huawei Technologies Ltd. 
+ */ + +#define pr_fmt(fmt) "mem_sampling: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEM_SAMPLING_DISABLED 0x0 +#define MEM_SAMPLING_NORMAL 0x1 +#define MEM_SAMPLING_MIN_VALUE 0 +#define MEM_SAMPLING_MAX_VALUE 5 + +struct mem_sampling_ops_struct mem_sampling_ops; +static int mem_sampling_override __initdata; +static int sysctl_mem_sampling_mode; + +static const int mem_sampling_min_value = MEM_SAMPLING_MIN_VALUE; +static const int mem_sampling_max_value = MEM_SAMPLING_MAX_VALUE; + +/* keep track of who uses the SPE */ +DEFINE_PER_CPU(enum arm_spe_user_e, arm_spe_user); +EXPORT_PER_CPU_SYMBOL_GPL(arm_spe_user); + +enum mem_sampling_saved_state_e { + MEM_SAMPLING_STATE_ENABLE, + MEM_SAMPLING_STATE_DISABLE, + MEM_SAMPLING_STATE_EMPTY, +}; +enum mem_sampling_saved_state_e mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY; + +/* + * Callbacks should be registered using mem_sampling_record_cb_register() + * by NUMA, DAMON, etc. during their initialisation. + * Callbacks will be invoked when new hardware PMU records are captured. + */ +typedef void (*mem_sampling_record_cb_type)(struct mem_sampling_record *record); + +struct mem_sampling_record_cb_list_entry { + struct list_head list; + mem_sampling_record_cb_type cb; +}; +LIST_HEAD(mem_sampling_record_cb_list); + +struct mem_sampling_numa_access_work { + struct callback_head work; + u64 vaddr, paddr; + int cpu; +}; + +void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb) +{ + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + if (cb_entry->cb == cb) + return; + } + + cb_entry = kmalloc(sizeof(struct mem_sampling_record_cb_list_entry), GFP_KERNEL); + if (!cb_entry) + return; + + cb_entry->cb = cb; + list_add(&(cb_entry->list), &mem_sampling_record_cb_list); +} + +void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb) +{ + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + if (cb_entry->cb == cb) { + list_del(&cb_entry->list); + kfree(cb_entry); + return; + } + } +} + +DEFINE_STATIC_KEY_FALSE(mem_sampling_access_hints); +void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr) +{ + if (!static_branch_unlikely(&mem_sampling_access_hints)) + return; + + if (!mem_sampling_ops.sampling_start) + return; + + if (curr->mm) + mem_sampling_ops.sampling_start(); + else + mem_sampling_ops.sampling_stop(); +} + +DEFINE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling); +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +static int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr, int page_nid, int *flags) +{ + folio_get(folio); + + /* Record the current PID accessing the VMA */ + vma_set_access_pid_bit(vma); + + count_vm_numa_event(NUMA_HINT_FAULTS); + if (page_nid == numa_node_id()) { + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + *flags |= TNF_FAULT_LOCAL; + } + + return mpol_misplaced(folio, vma, addr); +} + +/* + * Called from task_work context to act upon the page access. + * + * Physical address (provided by SPE) is used directly instead + * of walking the page tables to get to the PTE/page. Hence we + * don't check if PTE is writable for the TNF_NO_GROUP + * optimization, which means RO pages are considered for grouping. 
+ */ +static void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr) +{ + struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + struct page *page = NULL; + struct folio *folio; + int page_nid = NUMA_NO_NODE; + int last_cpupid; + int target_nid; + int flags = 0; + + if (!mm) + return; + + if (!mmap_read_trylock(mm)) + return; + + vma = find_vma(mm, laddr); + if (!vma) + goto out_unlock; + + if (!vma_migratable(vma) || !vma_policy_mof(vma) || + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) + goto out_unlock; + + if (!vma->vm_mm || + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + goto out_unlock; + + if (!vma_is_accessible(vma)) + goto out_unlock; + + /* pfn_to_online_page() may return NULL for offline or invalid PFNs */ + page = pfn_to_online_page(PHYS_PFN(paddr)); + if (!page) + goto out_unlock; + + folio = page_folio(page); + + if (!folio || folio_is_zone_device(folio)) + goto out_unlock; + + if (unlikely(!PageLRU(page))) + goto out_unlock; + + /* TODO: handle PTE-mapped THP or PMD-mapped THP */ + if (folio_test_large(folio)) + goto out_unlock; + + /* + * Flag if the page is shared between multiple address spaces. This + * is later used when determining whether to group tasks together + */ + if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) + flags |= TNF_SHARED; + + page_nid = folio_nid(folio); + + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if (folio_use_access_time(folio)) + last_cpupid = (-1 & LAST_CPUPID_MASK); + else + last_cpupid = folio_last_cpupid(folio); + target_nid = numa_migrate_prep(folio, vma, laddr, page_nid, &flags); + if (target_nid == NUMA_NO_NODE) { + folio_put(folio); + goto out; + } + + /* Migrate to the requested node */ + if (migrate_misplaced_folio(folio, vma, target_nid)) { + page_nid = target_nid; + flags |= TNF_MIGRATED; + } else { + flags |= TNF_MIGRATE_FAIL; + } + +out: + trace_mm_numa_migrating(laddr, page_nid, target_nid, flags & TNF_MIGRATED); + if (page_nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, page_nid, 1, flags); + +out_unlock: + mmap_read_unlock(mm); +} + +static void task_mem_sampling_access_work(struct callback_head *work) +{ + struct mem_sampling_numa_access_work *iwork = + container_of(work, struct mem_sampling_numa_access_work, work); + + if (iwork->cpu == smp_processor_id()) + do_numa_access(current, iwork->vaddr, iwork->paddr); + kfree(iwork); +} + +static void numa_create_taskwork(u64 vaddr, u64 paddr, int cpu) +{ + struct mem_sampling_numa_access_work *iwork = NULL; + + iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC); + if (!iwork) + return; + + iwork->vaddr = vaddr; + iwork->paddr = paddr; + iwork->cpu = cpu; + + init_task_work(&iwork->work, task_mem_sampling_access_work); + task_work_add(current, &iwork->work, TWA_RESUME); +} + +static void numa_balancing_mem_sampling_cb(struct mem_sampling_record *record) +{ + struct task_struct *p = current; + u64 vaddr = record->virt_addr; + u64 paddr = record->phys_addr; + + /* Discard kernel address accesses */ + if (vaddr & (1UL << 63)) + return; + + if (p->pid != record->context_id) + return; + + trace_mm_mem_sampling_access_record(vaddr, paddr, smp_processor_id(), + current->pid); + numa_create_taskwork(vaddr, paddr, smp_processor_id()); +} + +static void numa_balancing_mem_sampling_cb_register(void) +{ + mem_sampling_record_cb_register(numa_balancing_mem_sampling_cb); +} + +static void numa_balancing_mem_sampling_cb_unregister(void) +{ + mem_sampling_record_cb_unregister(numa_balancing_mem_sampling_cb); +} +static void 
set_numabalancing_mem_sampling_state(bool enabled) +{ + if (enabled) { + numa_balancing_mem_sampling_cb_register(); + static_branch_enable(&sched_numabalancing_mem_sampling); + } else { + numa_balancing_mem_sampling_cb_unregister(); + static_branch_disable(&sched_numabalancing_mem_sampling); + } +} +#else +static inline void set_numabalancing_mem_sampling_state(bool enabled) { } +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ + +DEFINE_STATIC_KEY_FALSE(mm_damon_mem_sampling); +#ifdef CONFIG_DAMON_MEM_SAMPLING +static void damon_mem_sampling_record_cb(struct mem_sampling_record *record) +{ + struct damon_mem_sampling_fifo *damon_fifo; + struct damon_mem_sampling_record domon_record; + struct task_struct *task = NULL; + struct mm_struct *mm; + + /* Discard kernel address accesses */ + if (record->virt_addr & (1UL << 63)) + return; + + task = find_get_task_by_vpid((pid_t)record->context_id); + if (!task) + return; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return; + + damon_fifo = mm->damon_fifo; + mmput(mm); + + domon_record.vaddr = record->virt_addr; + + /* only the proc under monitor now has damon_fifo */ + if (damon_fifo) { + if (kfifo_is_full(&damon_fifo->rx_kfifo)) + return; + + kfifo_in_locked(&damon_fifo->rx_kfifo, &domon_record, + sizeof(struct damon_mem_sampling_record), + &damon_fifo->rx_kfifo_lock); + return; + } +} + +static void damon_mem_sampling_record_cb_register(void) +{ + mem_sampling_record_cb_register(damon_mem_sampling_record_cb); +} + +static void damon_mem_sampling_record_cb_unregister(void) +{ + mem_sampling_record_cb_unregister(damon_mem_sampling_record_cb); +} + +static void set_damon_mem_sampling_state(bool enabled) +{ + if (enabled) { + damon_mem_sampling_record_cb_register(); + static_branch_enable(&mm_damon_mem_sampling); + } else { + damon_mem_sampling_record_cb_unregister(); + static_branch_disable(&mm_damon_mem_sampling); + } +} + +bool damon_use_mem_sampling(void) +{ + return static_branch_unlikely(&mem_sampling_access_hints) && + static_branch_unlikely(&mm_damon_mem_sampling); +} +#else +static inline void set_damon_mem_sampling_state(bool enabled) { } +#endif + +void mem_sampling_process(void) +{ + int i, nr_records; + struct mem_sampling_record *record; + struct mem_sampling_record *record_base; + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + mem_sampling_ops.sampling_decoding(); + + record_base = (struct mem_sampling_record *)mem_sampling_ops.mm_spe_getbuf_addr(); + nr_records = mem_sampling_ops.mm_spe_getnum_record(); + + if (list_empty(&mem_sampling_record_cb_list)) + goto out; + + for (i = 0; i < nr_records; i++) { + record = record_base + i; + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + cb_entry->cb(record); + } + } +out: + /* if mem_sampling_access_hints is set to false, stop sampling */ + if (static_branch_unlikely(&mem_sampling_access_hints)) + mem_sampling_ops.sampling_continue(); + else + mem_sampling_ops.sampling_stop(); +} +EXPORT_SYMBOL_GPL(mem_sampling_process); + +static inline enum mem_sampling_type_enum mem_sampling_get_type(void) +{ +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING + return MEM_SAMPLING_ARM_SPE; +#else + return MEM_SAMPLING_UNSUPPORTED; +#endif +} + +static void __set_mem_sampling_state(bool enabled) +{ + if (enabled) + static_branch_enable(&mem_sampling_access_hints); + else { + static_branch_disable(&mem_sampling_access_hints); + set_numabalancing_mem_sampling_state(enabled); + set_damon_mem_sampling_state(enabled); + } +} + +void set_mem_sampling_state(bool 
enabled) +{ + if (mem_sampling_saved_state != MEM_SAMPLING_STATE_EMPTY) { + mem_sampling_saved_state = enabled ? MEM_SAMPLING_STATE_ENABLE : + MEM_SAMPLING_STATE_DISABLE; + return; + } + + if (!mem_sampling_ops.sampling_start || !mm_spe_enabled()) + return; + if (enabled) + sysctl_mem_sampling_mode = MEM_SAMPLING_NORMAL; + else + sysctl_mem_sampling_mode = MEM_SAMPLING_DISABLED; + __set_mem_sampling_state(enabled); +} + +void mem_sampling_user_switch_process(enum user_switch_type type) +{ + bool state; + int mm_spe_perf_user_count = 0; + int cpu; + + if (type > USER_SWITCH_BACK_TO_MEM_SAMPLING) { + pr_err("user switch type error.\n"); + return; + } + + for_each_possible_cpu(cpu) { + if (per_cpu(arm_spe_user, cpu) == SPE_USER_PERF) + mm_spe_perf_user_count++; + } + + if (type == USER_SWITCH_AWAY_FROM_MEM_SAMPLING) { + /* only save the state the first time we leave mem_sampling */ + if (mem_sampling_saved_state != MEM_SAMPLING_STATE_EMPTY) + return; + + if (static_branch_unlikely(&mem_sampling_access_hints)) + mem_sampling_saved_state = MEM_SAMPLING_STATE_ENABLE; + else + mem_sampling_saved_state = MEM_SAMPLING_STATE_DISABLE; + + pr_debug("user switch away from mem_sampling, %s is saved, set to disable.\n", + mem_sampling_saved_state ? "disabled" : "enabled"); + + set_mem_sampling_state(false); + } else { + /* If the state is not backed up, do not restore it */ + if (mem_sampling_saved_state == MEM_SAMPLING_STATE_EMPTY || mm_spe_perf_user_count) + return; + + state = (mem_sampling_saved_state == MEM_SAMPLING_STATE_ENABLE) ? true : false; + set_mem_sampling_state(state); + mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY; + + pr_debug("user switch back to mem_sampling, set to saved %s.\n", + state ? "enable" : "disable"); + } +} +EXPORT_SYMBOL_GPL(mem_sampling_user_switch_process); + +#ifdef CONFIG_PROC_SYSCTL +static int proc_mem_sampling_enable(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = 0; + + if (static_branch_likely(&mem_sampling_access_hints)) + state = 1; + if (static_branch_likely(&sched_numabalancing_mem_sampling)) + state = 2; + if (static_branch_likely(&mm_damon_mem_sampling)) + state = 3; + if (static_branch_likely(&mm_damon_mem_sampling) && + static_branch_likely(&sched_numabalancing_mem_sampling)) + state = 4; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + t.extra1 = (int *)&mem_sampling_min_value; + t.extra2 = (int *)&mem_sampling_max_value; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) { + switch (state) { + case 0: + set_mem_sampling_state(false); + break; + case 1: + set_mem_sampling_state(false); + set_mem_sampling_state(true); + break; + case 2: + set_mem_sampling_state(false); + set_mem_sampling_state(true); + set_numabalancing_mem_sampling_state(true); + break; + case 3: + set_mem_sampling_state(false); + set_mem_sampling_state(true); + set_damon_mem_sampling_state(true); + break; + case 4: + set_mem_sampling_state(true); + set_numabalancing_mem_sampling_state(true); + set_damon_mem_sampling_state(true); + break; + default: + return -EINVAL; + } + } + return err; +} + +static struct ctl_table mem_sampling_sysctls[] = { + { + .procname = "mem_sampling_enable", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_mem_sampling_enable, + .extra1 = SYSCTL_ZERO, + .extra2 = (int *)&mem_sampling_max_value, + }, 
+ {} +}; + +static void __init mem_sampling_sysctl_init(void) +{ + register_sysctl_init("kernel", mem_sampling_sysctls); +} +#else +#define mem_sampling_sysctl_init() do { } while (0) +#endif + +static void __init check_mem_sampling_enable(void) +{ + bool mem_sampling_default = false; + + /* Parsed by setup_mem_sampling_enable(); override == 1 enables */ + if (mem_sampling_override) + set_mem_sampling_state(mem_sampling_override == 1); + else + set_mem_sampling_state(mem_sampling_default); +} + +static int __init setup_mem_sampling_enable(char *str) +{ + int ret = 0; + + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + mem_sampling_override = 1; + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse mem_sampling=\n"); + + return ret; +} +__setup("mem_sampling=", setup_mem_sampling_enable); + +static int __init mem_sampling_init(void) +{ + enum mem_sampling_type_enum mem_sampling_type = mem_sampling_get_type(); + int cpu; + + switch (mem_sampling_type) { + case MEM_SAMPLING_ARM_SPE: + mem_sampling_ops.sampling_start = mm_spe_start; + mem_sampling_ops.sampling_stop = mm_spe_stop; + mem_sampling_ops.sampling_continue = mm_spe_continue; + mem_sampling_ops.sampling_decoding = mm_spe_decoding; + mem_sampling_ops.mm_spe_getbuf_addr = mm_spe_getbuf_addr; + mem_sampling_ops.mm_spe_getnum_record = mm_spe_getnum_record; + + break; + + default: + pr_info("unsupported hardware pmu type (%d), disabling access hints!\n", + mem_sampling_type); + set_mem_sampling_state(false); + return -ENODEV; + } + check_mem_sampling_enable(); + mem_sampling_sysctl_init(); + + for_each_possible_cpu(cpu) + per_cpu(arm_spe_user, cpu) = SPE_USER_MEM_SAMPLING; + + return 0; +} +late_initcall(mem_sampling_init); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 219c098b3ffa24287b6fdd2c7b70201fe4b36b07..88f0bb008efd78e619ebe43981b8f6830bfe3913 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1104,6 +1104,46 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return err; } +/* + * Migrate pages in an area from one node to a target node. + * Returns error or the number of pages not migrated. + */ +static int migrate_area_to_node(struct mm_struct *mm, int source, int dest, + unsigned long start, unsigned long end, int flags) +{ + nodemask_t nmask; + struct vm_area_struct *vma; + LIST_HEAD(pagelist); + int err = 0; + struct migration_target_control mtc = { + .nid = dest, + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + }; + + nodes_clear(nmask); + node_set(source, nmask); + + /* + * This does not "check" the range but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + */ + vma = find_vma(mm, 0); + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + queue_pages_range(mm, start, end, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist, false); + + if (!list_empty(&pagelist)) { + err = migrate_pages(&pagelist, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_DAMON_DEMOTION, NULL); + if (err) + putback_movable_pages(&pagelist); + } + + return err; +} + + /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. @@ -1209,6 +1249,112 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, } +/* + * Move the pages in the given mm area between the two nodesets so as to + * preserve the physical layout as much as possible. + * + * Returns the number of pages that could not be moved. 
+ */ +int do_migrate_area_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, unsigned long start, + unsigned long end, int flags) +{ + int busy = 0; + int err = 0; + nodemask_t tmp; + + lru_cache_disable(); + + mmap_read_lock(mm); + + /* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scanning from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ + + tmp = *from; + while (!nodes_empty(tmp)) { + int s, d; + int source = NUMA_NO_NODE; + int dest = 0; + + for_each_node_mask(s, tmp) { + + /* + * do_migrate_pages() tries to maintain the relative + * node relationship of the pages established between + * threads and memory areas. + * + * However if the number of source nodes is not equal to + * the number of destination nodes we can not preserve + * this node relative relationship. In that case, skip + * copying memory from a node that is in the destination + * mask. + * + * Example: [2,3,4] -> [3,4,5] moves everything. + * [0-7] - > [3,4,5] moves only 0,1,2,6,7. + */ + + if ((nodes_weight(*from) != nodes_weight(*to)) && + (node_isset(s, *to))) + continue; + + d = node_remap(s, *from, *to); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + + /* dest not in remaining from nodes? */ + if (!node_isset(dest, tmp)) + break; + } + if (source == NUMA_NO_NODE) + break; + + node_clear(source, tmp); + err = migrate_area_to_node(mm, source, dest, start, end, flags); + if (err > 0) + busy += err; + if (err < 0) + break; + } + mmap_read_unlock(mm); + + lru_cache_enable(); + if (err < 0) + return err; + return busy; + +} + /* * Allocate a new page for page migration based on vma policy. * Start by assuming the page is mapped by the same vma as contains @start.
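The source/dest pair selection in do_migrate_area_pages() (shared with do_migrate_pages()) is easiest to follow with a concrete example. The sketch below is a stand-alone userspace illustration, not kernel code: nodes are modeled as bits of an unsigned long, and node_remap() is approximated for the equal-weight case by mapping the i-th set bit of 'from' to the i-th set bit of 'to'.

/*
 * Userspace illustration of the node pair selection used above.
 * Sketch for the equal-weight case only; not part of the patch.
 */
#include <stdio.h>

static int remap(int s, unsigned long from, unsigned long to)
{
	int pos = 0, n;

	/* ordinal position of node s within 'from' */
	for (n = 0; n < s; n++)
		if (from & (1UL << n))
			pos++;

	/* the node of 'to' at the same ordinal position */
	for (n = 0; n < 64; n++) {
		if (!(to & (1UL << n)))
			continue;
		if (pos-- == 0)
			return n;
	}
	return s;
}

int main(void)
{
	unsigned long from = 0x1cUL;	/* nodes {2,3,4} */
	unsigned long to   = 0x38UL;	/* nodes {3,4,5} */
	int s;

	for (s = 0; s < 8; s++)
		if (from & (1UL << s))
			printf("node %d -> node %d\n", s, remap(s, from, to));
	return 0;	/* prints 2 -> 3, 3 -> 4, 4 -> 5 */
}

When the weights of 'from' and 'to' differ, the kernel loop additionally skips any source node that is already present in the destination mask, which is why the comment's [0-7] -> [3,4,5] example moves only nodes 0, 1, 2, 6 and 7.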