diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 7481b12939e62a7cb3bff111a0a4b35c519ec010..3f9be66edece967c66e0ca25da824174d078030e 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1228,12 +1228,15 @@ CONFIG_ETMEM_SCAN=m CONFIG_ETMEM_SWAP=m CONFIG_ETMEM=y # CONFIG_BPF_READAHEAD is not set +CONFIG_MEM_SAMPLING=y +CONFIG_NUMABALANCING_MEM_SAMPLING=y # # Data Access Monitoring # CONFIG_DAMON=y CONFIG_DAMON_VADDR=y +CONFIG_DAMON_MEM_SAMPLING=y CONFIG_DAMON_PADDR=y CONFIG_DAMON_SYSFS=y # CONFIG_DAMON_DBGFS is not set @@ -6969,6 +6972,7 @@ CONFIG_CPU_INSPECTOR_ATF=m CONFIG_ROH=m CONFIG_ROH_HNS=m +CONFIG_ARM_SPE_MEM_SAMPLING=y # end of Device Drivers # diff --git a/drivers/Kconfig b/drivers/Kconfig index 3be1197d872c1d64796f3d454e768b9f0e01f899..2b65435015d7fc2b2299ff10b14707fd839164a3 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -251,4 +251,6 @@ source "drivers/roh/Kconfig" source "drivers/coda/Kconfig" +source "drivers/arm/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 3955e605df14ffb72cce5203c1d1b60df19d75ee..79d803250002b9459f25e3474a768fd5da2c94c9 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -205,3 +205,5 @@ obj-$(CONFIG_S390) += s390/ obj-$(CONFIG_ROH) += roh/ obj-$(CONFIG_HISI_VIRTCCA_CODA) += coda/ + +obj-$(CONFIG_ARM_SPE_MEM_SAMPLING) += arm/mm_monitor/ diff --git a/drivers/arm/Kconfig b/drivers/arm/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..d3291f0d5d57793689b233eff4afb3aa3ae48046 --- /dev/null +++ b/drivers/arm/Kconfig @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +source "drivers/arm/mm_monitor/Kconfig" diff --git a/drivers/arm/mm_monitor/Kconfig b/drivers/arm/mm_monitor/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..417b403ecffcf1400794cfd134096a4ea21cba15 --- /dev/null +++ b/drivers/arm/mm_monitor/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# arm spe dirver +# +config ARM_SPE_MEM_SAMPLING + bool "In-kernel SPE for driver for page access profiling" + depends on ARM_SPE_PMU + default n + help + Enable support for the ARMv8.2 Statistical Profiling Extension (SPE), + which provides periodic sampling of memory accesses and operations + in the CPU pipeline. This extension allows the driver to monitor + memory access patterns, which can help with performance tuning, + debugging, and analyzing memory-related bottlenecks. + + This feature is only available on ARM64 architecture and will fall + back to the native software sampling mechanism if the ARM SPE PMU + (Performance Monitoring Unit) is in use. When enabled, this + configuration will activate the in-kernel driver to collect profiling + data on page-level memory accesses. 
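To make the in-kernel interface described in the help text above concrete, here is a minimal, illustrative sketch of a kernel-side consumer driving the sampling path this series exports (mm_spe_start(), mm_spe_stop(), mm_spe_decoding(), mm_spe_getbuf_addr(), mm_spe_getnum_record(), with the record layout from <linux/mem_sampling.h>). The consumer_drain_cpu() wrapper and its pr_debug() reporting are hypothetical; in the series itself the equivalent sequence is normally driven by the mem_sampling layer from the SPE buffer-management interrupt.

/*
 * Illustrative consumer of the mm_spe API added by this series; only the
 * mm_spe_*() calls and the mem_sampling_record fields come from the patch,
 * the wrapper itself is a sketch.
 */
#include <linux/kernel.h>
#include <linux/mem_sampling.h>
/* struct mm_spe_buf is defined in drivers/arm/mm_monitor/mm_spe.h */
#include "mm_spe.h"

static void consumer_drain_cpu(void)
{
	struct mm_spe_buf *buf;
	struct mem_sampling_record *rec;
	int i, nr;

	if (!mm_spe_enabled())
		return;

	/* Program PMSCR/PMSIRR/PMBLIMITR and start filling this CPU's buffer. */
	if (mm_spe_start())
		return;

	/* ... sampling window; the driver normally stops from its IRQ path ... */

	mm_spe_stop();		/* drain and disable the profiling buffer */
	mm_spe_decoding();	/* parse raw SPE packets into records     */

	buf = mm_spe_getbuf_addr();
	nr = mm_spe_getnum_record();
	for (i = 0; i < nr; i++) {
		rec = (struct mem_sampling_record *)buf->record_base + i;
		pr_debug("va=0x%llx pa=0x%llx lat=%u\n",
			 rec->virt_addr, rec->phys_addr, rec->latency);
	}
}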
diff --git a/drivers/arm/mm_monitor/Makefile b/drivers/arm/mm_monitor/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9b0b1f18a529b7e6ed7fab13944847316befb807 --- /dev/null +++ b/drivers/arm/mm_monitor/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_ARM_SPE_MEM_SAMPLING) += mm_spe.o spe-decoder/arm-spe-decoder.o spe-decoder/arm-spe-pkt-decoder.o diff --git a/drivers/arm/mm_monitor/mm_spe.c b/drivers/arm/mm_monitor/mm_spe.c new file mode 100644 index 0000000000000000000000000000000000000000..0eaa7e7397e1d392287801e1704d5ce998438e51 --- /dev/null +++ b/drivers/arm/mm_monitor/mm_spe.c @@ -0,0 +1,537 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm_spe.c: Arm Statistical Profiling Extensions support + * Copyright (c) 2019-2020, Arm Ltd. + * Copyright (c) 2024-2025, Huawei Technologies Ltd. + */ + +#define PMUNAME "mm_spe" +#define DRVNAME PMUNAME "_driver" +#define pr_fmt(fmt) DRVNAME ": " fmt + +#include +#include +#include +#include + +#include "spe-decoder/arm-spe-decoder.h" +#include "spe-decoder/arm-spe-pkt-decoder.h" +#include "mm_spe.h" +static bool spe_boost_enable; + +static struct mm_spe *spe; + +#define SPE_INIT_FAIL 0 +#define SPE_INIT_READY 1 +#define SPE_INIT_SUCC 2 +static int spe_probe_status = SPE_INIT_FAIL; + +#define SPE_PMU_FEAT_FILT_EVT (1UL << 0) +#define SPE_PMU_FEAT_FILT_TYP (1UL << 1) +#define SPE_PMU_FEAT_FILT_LAT (1UL << 2) +#define SPE_PMU_FEAT_ARCH_INST (1UL << 3) +#define SPE_PMU_FEAT_LDS (1UL << 4) +#define SPE_PMU_FEAT_ERND (1UL << 5) +#define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) +#define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) + +DEFINE_PER_CPU(struct mm_spe_buf, per_cpu_spe_buf); + +int mm_spe_percpu_buffer_alloc(int cpu) +{ + struct mm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu); + void *alloc_base; + + if (spe_buf->base && spe_buf->record_base) + return 0; + + /* alloc spe raw data buffer */ + alloc_base = kzalloc_node(SPE_BUFFER_MAX_SIZE, GFP_KERNEL, cpu_to_node(cpu)); + if (unlikely(!alloc_base)) { + pr_err("alloc spe raw data buffer failed.\n"); + return -ENOMEM; + } + + spe_buf->base = alloc_base; + + spe_buf->size = SPE_BUFFER_SIZE; + spe_buf->cur = alloc_base + SPE_BUFFER_MAX_SIZE - SPE_BUFFER_SIZE; + spe_buf->period = SPE_SAMPLE_PERIOD; + + /* alloc record buffer */ + spe_buf->record_size = SPE_RECORD_ENTRY_SIZE * SPE_RECORD_BUFFER_MAX_RECORDS; + spe_buf->record_base = kzalloc_node(spe_buf->record_size, GFP_KERNEL, cpu_to_node(cpu)); + if (unlikely(!spe_buf->record_base)) { + kfree(alloc_base); + pr_err("alloc spe record buffer failed.\n"); + return -ENOMEM; + } + return 0; +} +EXPORT_SYMBOL_GPL(mm_spe_percpu_buffer_alloc); + +int mm_spe_buffer_alloc(void) +{ + int cpu, ret = 0; + cpumask_t *mask = &spe->supported_cpus; + + for_each_possible_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + ret = mm_spe_percpu_buffer_alloc(cpu); + if (ret) + return ret; + } + return ret; +} +EXPORT_SYMBOL_GPL(mm_spe_buffer_alloc); + +void mm_spe_percpu_buffer_free(int cpu) +{ + struct mm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu); + + if (!spe_buf->base) + return; + + kfree(spe_buf->base); + spe_buf->cur = NULL; + spe_buf->base = NULL; + spe_buf->size = 0; + + kfree(spe_buf->record_base); + spe_buf->record_base = NULL; + spe_buf->record_size = 0; +} +EXPORT_SYMBOL_GPL(mm_spe_percpu_buffer_free); + +void mm_spe_buffer_free(void) +{ + cpumask_t *mask = &spe->supported_cpus; + int cpu; + + for_each_possible_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + 
mm_spe_percpu_buffer_free(cpu); + } + spe_probe_status -= 1; + set_mem_sampling_state(false); +} +EXPORT_SYMBOL_GPL(mm_spe_buffer_free); + +static void mm_spe_buffer_init(void) +{ + u64 base, limit; + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + if (!spe_buf || !spe_buf->cur || !spe_buf->size) { + /* + * We still need to clear the limit pointer, since the + * profiler might only be disabled by virtue of a fault. + */ + limit = 0; + goto out_write_limit; + } + + base = (u64)spe_buf->cur; + limit = ((u64)spe_buf->cur + spe_buf->size) | PMBLIMITR_EL1_E; + write_sysreg_s(base, SYS_PMBPTR_EL1); + +out_write_limit: + write_sysreg_s(limit, SYS_PMBLIMITR_EL1); +} + +void mm_spe_add_probe_status(void) +{ + spe_probe_status += 1; +} +EXPORT_SYMBOL_GPL(mm_spe_add_probe_status); + +static void mm_spe_disable_and_drain_local(void) +{ + /* Disable profiling at EL0 and EL1 */ + write_sysreg_s(0, SYS_PMSCR_EL1); + isb(); + + /* Drain any buffered data */ + psb_csync(); + dsb(nsh); + + /* Disable the profiling buffer */ + write_sysreg_s(0, SYS_PMBLIMITR_EL1); + isb(); + + /* Disable boost_spe profiling */ + if (spe->support_boost_spe && spe_boost_enable) { + write_sysreg_s(0, SYS_OMHTPG_EL1); + isb(); + } +} + +static u64 mm_spe_to_pmsfcr(void) +{ + u64 reg = 0; + + if (spe->load_filter) + reg |= PMSFCR_EL1_LD; + + if (spe->store_filter) + reg |= PMSFCR_EL1_ST; + + if (spe->branch_filter) + reg |= PMSFCR_EL1_B; + + if (reg) + reg |= PMSFCR_EL1_FT; + + if (spe->event_filter) + reg |= PMSFCR_EL1_FE; + + if (spe->inv_event_filter) + reg |= PMSFCR_EL1_FnE; + + if (spe->min_latency) + reg |= PMSFCR_EL1_FL; + + return reg; +} + +static u64 arm_spe_to_htpg(void) +{ + u64 reg = 0; + struct boost_spe_contol *boost_spe = &spe->boost_spe; + + if (boost_spe->rmt_acc_en) + reg |= SYS_OMHTPG_EL1_RMEN; + + if (boost_spe->boost_spe_en_cfg < 0x4) + reg |= boost_spe->boost_spe_en_cfg; + + if (boost_spe->record_sel) + reg |= SYS_OMHTPG_EL1_REC_SEL; + + if (boost_spe->pop_uop_sel) + reg |= SYS_OMHTPG_EL1_POP_UOP_SEL; + + if (boost_spe->sft_cfg < 0x4) + reg |= boost_spe->sft_cfg << SYS_OMHTPG_EL1_SFT_CFG_SHIFT; + + if (boost_spe->boost_spe_pa_flt_en || boost_spe->rmt_acc_pa_flt_en) { + reg |= 1UL << SYS_OMHTPG_EL1_PAEN_SHIFT; + reg |= 1UL << SYS_OMHTPG_EL1_RMPAFLEN_SHIFT; + + if (boost_spe->pa_flt_pt < 0x8000000 && boost_spe->pa_flt_mask < 0x8000000) { + reg |= boost_spe->pa_flt_pt << SYS_OMHTPG_EL1_PAFL_SHIFT; + reg |= (u64)boost_spe->pa_flt_mask << SYS_OMHTPG_EL1_PAFLMK_SHIFT; + } + } + + return reg; +} + +static u64 mm_spe_to_pmsevfr(void) +{ + return spe->event_filter; +} + +static u64 mm_spe_to_pmsnevfr(void) +{ + return spe->inv_event_filter; +} + +static u64 mm_spe_to_pmslatfr(void) +{ + return spe->min_latency; +} + +static void mm_spe_sanitise_period(struct mm_spe_buf *spe_buf) +{ + u64 period = spe_buf->period; + u64 max_period = PMSIRR_EL1_INTERVAL_MASK; + + if (period < spe->min_period) + period = spe->min_period; + else if (period > max_period) + period = max_period; + else + period &= max_period; + + spe_buf->period = period; +} + +static u64 mm_spe_to_pmsirr(void) +{ + u64 reg = 0; + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + mm_spe_sanitise_period(spe_buf); + + if (spe->jitter) + reg |= 0x1; + + reg |= spe_buf->period << 8; + + return reg; +} + +static u64 mm_spe_to_pmscr(void) +{ + u64 reg = 0; + + if (spe->ts_enable) + reg |= PMSCR_EL1_TS; + + if (spe->pa_enable) + reg |= PMSCR_EL1_PA; + + if (spe->pct_enable < 0x4) + reg |= spe->pct_enable << 6; + + if (spe->exclude_user) + reg
|= PMSCR_EL1_E0SPE; + + if (spe->exclude_kernel) + reg |= PMSCR_EL1_E1SPE; + + if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) + reg |= PMSCR_EL1_CX; + + return reg; +} + +int mm_spe_start(void) +{ + u64 reg; + int cpu = smp_processor_id(); + + if (!cpumask_test_cpu(cpu, &spe->supported_cpus)) + return -ENOENT; + + mm_spe_buffer_init(); + + reg = mm_spe_to_pmsfcr(); + write_sysreg_s(reg, SYS_PMSFCR_EL1); + + reg = mm_spe_to_pmsevfr(); + write_sysreg_s(reg, SYS_PMSEVFR_EL1); + + if (spe->features & SPE_PMU_FEAT_INV_FILT_EVT) { + reg = mm_spe_to_pmsnevfr(); + write_sysreg_s(reg, SYS_PMSNEVFR_EL1); + } + + reg = mm_spe_to_pmslatfr(); + + write_sysreg_s(reg, SYS_PMSLATFR_EL1); + + reg = mm_spe_to_pmsirr(); + write_sysreg_s(reg, SYS_PMSIRR_EL1); + isb(); + + reg = mm_spe_to_pmscr(); + isb(); + write_sysreg_s(reg, SYS_PMSCR_EL1); + + if (spe->support_boost_spe) { + reg = arm_spe_to_htpg(); + isb(); + write_sysreg_s(reg, SYS_OMHTPG_EL1); + } + + return 0; +} + +void mm_spe_continue(void) +{ + int reg; + + mm_spe_buffer_init(); + + reg = mm_spe_to_pmscr(); + + isb(); + write_sysreg_s(reg, SYS_PMSCR_EL1); +} + +void mm_spe_stop(void) +{ + mm_spe_disable_and_drain_local(); +} + +void mm_spe_decoding(void) +{ + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + spe_buf->nr_records = 0; + arm_spe_decode_buf(spe_buf->cur, spe_buf->size); +} + +struct mm_spe_buf *mm_spe_getbuf_addr(void) +{ + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + return spe_buf; +} + +int mm_spe_getnum_record(void) +{ + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + return spe_buf->nr_records; +} + +struct mm_spe *mm_spe_get_desc(void) +{ + return spe; +} +EXPORT_SYMBOL_GPL(mm_spe_get_desc); + +int mm_spe_enabled(void) +{ + return spe_probe_status == SPE_INIT_SUCC; +} + +static const struct of_device_id mm_spe_sample_para_init_tb[] = { + { .compatible = "arm,statistical-profiling-extension-v1", + .data = (void *)1 }, + { /* Sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, mm_spe_sample_para_init_tb); + +static const struct platform_device_id mm_spe_match[] = { + { ARMV8_SPE_MEM_SAMPLING_PDEV_NAME, 0 }, + {} +}; +MODULE_DEVICE_TABLE(platform, mm_spe_match); + +static void arm_spe_boost_spe_para_init(void) +{ + struct boost_spe_contol *boost_spe = &spe->boost_spe; + + boost_spe->record_sel = 1; + boost_spe->pop_uop_sel = 0; + boost_spe->rmt_acc_pa_flt_en = 0; + boost_spe->rmt_acc_en = 1; + boost_spe->boost_spe_pa_flt_en = 0; + boost_spe->pa_flt_pt = 0; + boost_spe->pa_flt_mask = 0; + boost_spe->sft_cfg = 0; + boost_spe->boost_spe_en_cfg = 0x3; +} + +static void mm_spe_sample_para_init(void) +{ + u64 implementor = read_cpuid_implementor(); + u64 part_num = read_cpuid_part_number(); + + /* Is support boost_spe sampling? 
*/ + if (implementor == ARM_CPU_IMP_HISI && part_num == 0xd06) + spe->support_boost_spe = true; + + spe->sample_period = SPE_SAMPLE_PERIOD; + spe->jitter = 1; + spe->load_filter = 1; + spe->store_filter = 1; + spe->branch_filter = 0; + spe->inv_event_filter = 0; + spe->event_filter = 0x2; + + spe->ts_enable = 0; + spe->pa_enable = 1; + spe->pct_enable = 0; + + spe->exclude_user = 1; + spe->exclude_kernel = 0; + + spe->min_latency = 120; + + if (spe->support_boost_spe) + arm_spe_boost_spe_para_init(); +} + +void mm_spe_record_enqueue(struct arm_spe_record *record) +{ + struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + struct mem_sampling_record *record_tail; + + if (spe_buf->nr_records >= SPE_RECORD_BUFFER_MAX_RECORDS) { + pr_err("nr_records exceeded!\n"); + return; + } + + if (record->boost_spe_idx) + trace_spe_boost_spe_record((struct mem_sampling_record *)record); + trace_mm_spe_record((struct mem_sampling_record *)record); + record_tail = spe_buf->record_base + + spe_buf->nr_records * SPE_RECORD_ENTRY_SIZE; + *record_tail = *(struct mem_sampling_record *)record; + spe_buf->nr_records++; +} + +static int mm_spe_device_probe(struct platform_device *pdev) +{ + struct device *dev; + + if (!pdev) { + pr_err("pdev is NULL!\n"); + return -ENODEV; + } + + dev = &pdev->dev; + + /* + * If kernelspace is unmapped when running at EL0, then the SPE + * buffer will fault and prematurely terminate the AUX session. + */ + if (arm64_kernel_unmapped_at_el0()) { + dev_warn_once(dev, "buffer inaccessible. Try passing \"kpti=off\" on the kernel command line\n"); + return -EPERM; + } + + spe = devm_kzalloc(dev, sizeof(*spe), GFP_KERNEL); + if (!spe) + return -ENOMEM; + + spe->pdev = pdev; + platform_set_drvdata(pdev, spe); + + mm_spe_sample_para_init(); + + mm_spe_add_probe_status(); + return 0; +} + +static struct platform_driver mm_spe_driver = { + .id_table = mm_spe_match, + .driver = { + .name = DRVNAME, + .of_match_table = of_match_ptr(mm_spe_sample_para_init_tb), + .suppress_bind_attrs = true, + }, + .probe = mm_spe_device_probe, +}; + +static __init int enable_spe_boost(char *str) +{ + spe_boost_enable = true; + return 0; +} +early_param("enable_spe_boost", enable_spe_boost); + +static int __init mm_spe_init(void) +{ + return platform_driver_register(&mm_spe_driver); +} + +static void __exit arm_spe_exit(void) +{ + platform_driver_unregister(&mm_spe_driver); +} + +subsys_initcall(mm_spe_init); diff --git a/drivers/arm/mm_monitor/mm_spe.h b/drivers/arm/mm_monitor/mm_spe.h new file mode 100644 index 0000000000000000000000000000000000000000..5ffc11cb951a1a784291bf1f03be494edbd8f4fa --- /dev/null +++ b/drivers/arm/mm_monitor/mm_spe.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SPE_H +#define __SPE_H + +#define SPE_BUFFER_MAX_SIZE (PAGE_SIZE) +#define SPE_BUFFER_SIZE (PAGE_SIZE / 16) + +#define SPE_SAMPLE_PERIOD 1024 + +#define SPE_RECORD_BUFFER_MAX_RECORDS (100) +#define SPE_RECORD_ENTRY_SIZE sizeof(struct mem_sampling_record) +#define ARMV8_SPE_MEM_SAMPLING_PDEV_NAME "arm,mm_spe,spe-v1" + +/* boost_spe sampling controls */ +#define SYS_OMHTPG_EL1 sys_reg(3, 0, 15, 8, 2) +#define SYS_OMHTPG_EL1_RMCF_SHIFT 0 +#define SYS_OMHTPG_EL1_RMCF_MASK 0x3UL +#define SYS_OMHTPG_EL1_RMEN GENMASK(2, 2) +#define SYS_OMHTPG_EL1_RMEN_SHIFT 2 +#define SYS_OMHTPG_EL1_PAFL GENMASK(3, 3) +#define SYS_OMHTPG_EL1_PAFL_SHIFT 3 +#define SYS_OMHTPG_EL1_PAFL_MASK 0x7FFFFFFUL +#define SYS_OMHTPG_EL1_PAFLMK_SHIFT 30 +#define
SYS_OMHTPG_EL1_PAFLMK_MASK 0x7FFFFFFUL +#define SYS_OMHTPG_EL1_PAEN_SHIFT 57 + +#define SYS_OMHTPG_EL1_RMPAFLEN_SHIFT 58 +#define SYS_OMHTPG_EL1_POP_UOP_SEL GENMASK(59, 59) +#define SYS_OMHTPG_EL1_SFT_CFG_SHIFT 60 +#define SYS_OMHTPG_EL1_SFT_CFG_MASK 0x3UL +#define SYS_OMHTPG_EL1_REC_SEL GENMASK(62, 62) + +struct boost_spe_contol { + u32 boost_spe_en_cfg; + u32 pa_flt_pt; + u32 pa_flt_mask; + u64 sft_cfg; + bool boost_spe_pa_flt_en; + bool rmt_acc_en; + bool rmt_acc_pa_flt_en; + bool pop_uop_sel; + bool record_sel; +}; + +struct mm_spe { + struct pmu pmu; + struct platform_device *pdev; + cpumask_t supported_cpus; + struct hlist_node hotplug_node; + struct boost_spe_contol boost_spe; + int irq; /* PPI */ + u16 pmsver; + u16 min_period; + u16 counter_sz; + u64 features; + u16 max_record_sz; + u16 align; + u64 sample_period; + local64_t period_left; + bool jitter; + bool load_filter; + bool store_filter; + bool branch_filter; + u64 inv_event_filter; + u16 min_latency; + u64 event_filter; + bool ts_enable; + bool pa_enable; + u8 pct_enable; + bool exclude_user; + bool exclude_kernel; + bool support_boost_spe; +}; + +struct mm_spe_buf { + void *cur; /* for spe raw data buffer */ + int size; + int period; + void *base; + + void *record_base; /* for spe record buffer */ + int record_size; + int nr_records; +}; + +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING +void mm_spe_add_probe_status(void); +int mm_spe_percpu_buffer_alloc(int cpu); +int mm_spe_buffer_alloc(void); +void mm_spe_percpu_buffer_free(int cpu); +void mm_spe_buffer_free(void); +struct mm_spe *mm_spe_get_desc(void); +#else +static inline void mm_spe_add_probe_status(void) { } +static inline int mm_spe_percpu_buffer_alloc(int cpu) { return 0; } +static inline int mm_spe_buffer_alloc(void) { return 0; } +static inline void mm_spe_percpu_buffer_free(int cpu) { } +static inline void mm_spe_buffer_free(void) { } +static inline struct mm_spe *mm_spe_get_desc(void) { return NULL; } +#endif +#endif /* __SPE_H */ diff --git a/drivers/arm/mm_monitor/spe-decoder/Makefile b/drivers/arm/mm_monitor/spe-decoder/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4fdae5d381867542ad12a7a7d34aabfdd141e40b --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y := arm-spe-decoder.o arm-spe-pkt-decoder.o diff --git a/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.c b/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.c new file mode 100644 index 0000000000000000000000000000000000000000..1394d377c061946de96f469415677c0a32af88b7 --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arm_spe_decoder.c: ARM SPE support + * Copyright (c) 2017-2018, Arm Ltd. + * Copyright (c) 2024-2025, Huawei Technologies Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "arm-spe-decoder.h" + +static u64 arm_spe_calc_ip(int index, u64 payload) +{ + u64 ns, el, val; + u32 seen_idx; + + /* Instruction virtual address or Branch target address */ + if (index == SPE_ADDR_PKT_HDR_INDEX_INS || + index == SPE_ADDR_PKT_HDR_INDEX_BRANCH) { + ns = SPE_ADDR_PKT_GET_NS(payload); + el = SPE_ADDR_PKT_GET_EL(payload); + + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + + /* Fill highest byte for EL1 or EL2 (VHE) mode */ + if (ns && (el == SPE_ADDR_PKT_EL1 || el == SPE_ADDR_PKT_EL2)) + payload |= 0xffULL << SPE_ADDR_PKT_ADDR_BYTE7_SHIFT; + + /* Data access virtual address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT) { + + /* Clean tags */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + + /* + * Armv8 ARM (ARM DDI 0487F.c), chapter "D10.2.1 Address packet" + * defines the data virtual address payload format, the top byte + * (bits [63:56]) is assigned as top-byte tag; so we only can + * retrieve address value from bits [55:0]. + * + * According to Documentation/arm64/memory.rst, if detects the + * specific pattern in bits [55:52] of payload which falls in + * the kernel space, should fixup the top byte and this allows + * perf tool to parse DSO symbol for data address correctly. + * + * For this reason, if detects the bits [55:52] is 0xf, will + * fill 0xff into the top byte. + */ + val = SPE_ADDR_PKT_ADDR_GET_BYTE_6(payload); + if ((val & 0xf0ULL) == 0xf0ULL) + payload |= 0xffULL << SPE_ADDR_PKT_ADDR_BYTE7_SHIFT; + + /* Data access physical address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS) { + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + /* Boost_spe hot data access physical address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_BOOST_SPE_DATA_PHYS) { + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_BOOST_SPE(payload); + /* Remote Data access physical address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_REMOTE_DATA_PHYS) { + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + } else { + seen_idx = 0; + if (!(seen_idx & BIT(index))) { + seen_idx |= BIT(index); + pr_warn("ignoring unsupported address packet index: 0x%x\n", index); + } + } + + return payload; +} + +void arm_spe_decoder_free(struct arm_spe_decoder *decoder) +{ + kfree(decoder); +} + +static int arm_spe_get_next_packet(struct arm_spe_decoder *decoder) +{ + int ret; + + do { + if (!decoder->len) + return 0; + + ret = arm_spe_get_packet(decoder->buf, decoder->len, + &decoder->packet); + if (ret <= 0) { + /* Move forward for 1 byte */ + decoder->buf += 1; + decoder->len -= 1; + return -EBADMSG; + } + + decoder->buf += ret; + decoder->len -= ret; + } while (decoder->packet.type == ARM_SPE_PAD); + return 1; +} + +static int arm_spe_read_record(struct arm_spe_decoder *decoder) +{ + int err; + int idx; + u64 payload, ip; + + memset(&decoder->record, 0x0, sizeof(decoder->record)); + decoder->record.context_id = (u64)-1; + while (1) { + err = arm_spe_get_next_packet(decoder); + if (err <= 0) + return err; + + idx = decoder->packet.index; + payload = decoder->packet.payload; + + switch (decoder->packet.type) { + case ARM_SPE_TIMESTAMP: + decoder->record.timestamp = payload; + return 1; + case ARM_SPE_END: + return 1; + case ARM_SPE_ADDRESS: + ip = arm_spe_calc_ip(idx, payload); + if (idx == SPE_ADDR_PKT_HDR_INDEX_INS) + decoder->record.from_ip = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_BRANCH) + 
decoder->record.to_ip = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT) + decoder->record.virt_addr = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS) + decoder->record.phys_addr = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_BOOST_SPE_DATA_PHYS) + decoder->record.boost_spe_addr[decoder->record.boost_spe_idx++] + = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_REMOTE_DATA_PHYS) + decoder->record.remote_addr = ip; + break; + case ARM_SPE_COUNTER: + if (idx == SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT) + decoder->record.latency = payload; + break; + case ARM_SPE_CONTEXT: + decoder->record.context_id = payload; + break; + case ARM_SPE_OP_TYPE: + if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) { + if (payload & 0x1) + decoder->record.op = ARM_SPE_ST; + else + decoder->record.op = ARM_SPE_LD; + } + break; + case ARM_SPE_EVENTS: + if (payload & BIT(EV_L1D_REFILL)) + decoder->record.type |= ARM_SPE_L1D_MISS; + + if (payload & BIT(EV_L1D_ACCESS)) + decoder->record.type |= ARM_SPE_L1D_ACCESS; + + if (payload & BIT(EV_TLB_WALK)) + decoder->record.type |= ARM_SPE_TLB_MISS; + + if (payload & BIT(EV_TLB_ACCESS)) + decoder->record.type |= ARM_SPE_TLB_ACCESS; + + if (payload & BIT(EV_LLC_MISS)) + decoder->record.type |= ARM_SPE_LLC_MISS; + + if (payload & BIT(EV_LLC_ACCESS)) + decoder->record.type |= ARM_SPE_LLC_ACCESS; + + if (payload & BIT(EV_REMOTE_ACCESS)) + decoder->record.type |= ARM_SPE_REMOTE_ACCESS; + + if (payload & BIT(EV_MISPRED)) + decoder->record.type |= ARM_SPE_BRANCH_MISS; + + break; + case ARM_SPE_DATA_SOURCE: + decoder->record.source = payload; + break; + case ARM_SPE_BAD: + break; + case ARM_SPE_PAD: + break; + default: + pr_err("Get packet error!\n"); + return -1; + } + } + return 0; +} + +static bool arm_spe_decode(struct arm_spe_decoder *decoder) +{ + if (decoder->len) { + if (arm_spe_read_record(decoder) == 1) + return true; + } + return false; +} + +void arm_spe_decode_buf(const unsigned char *buf, size_t len) +{ + struct arm_spe_decoder decoder; + + decoder.buf = buf; + decoder.len = len; + + while (arm_spe_decode(&decoder)) + mm_spe_record_enqueue(&(decoder.record)); + +} +EXPORT_SYMBOL(arm_spe_decode_buf); diff --git a/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.h b/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..3ccc32de8afc4c2b6f4fde294e828ca1c44a0b40 --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/arm-spe-decoder.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * arm_spe_decoder.h: Arm Statistical Profiling Extensions support + * Copyright (c) 2019-2020, Arm Ltd. 
+ */ + +#ifndef INCLUDE__ARM_SPE_DECODER_H__ +#define INCLUDE__ARM_SPE_DECODER_H__ + +#include + +#include "arm-spe-pkt-decoder.h" + +enum arm_spe_sample_type { + ARM_SPE_L1D_ACCESS = 1 << 0, + ARM_SPE_L1D_MISS = 1 << 1, + ARM_SPE_LLC_ACCESS = 1 << 2, + ARM_SPE_LLC_MISS = 1 << 3, + ARM_SPE_TLB_ACCESS = 1 << 4, + ARM_SPE_TLB_MISS = 1 << 5, + ARM_SPE_BRANCH_MISS = 1 << 6, + ARM_SPE_REMOTE_ACCESS = 1 << 7, +}; + +enum arm_spe_op_type { + ARM_SPE_LD = 1 << 0, + ARM_SPE_ST = 1 << 1, +}; + +enum arm_spe_neoverse_data_source { + ARM_SPE_NV_L1D = 0x0, + ARM_SPE_NV_L2 = 0x8, + ARM_SPE_NV_PEER_CORE = 0x9, + ARM_SPE_NV_LOCAL_CLUSTER = 0xa, + ARM_SPE_NV_SYS_CACHE = 0xb, + ARM_SPE_NV_PEER_CLUSTER = 0xc, + ARM_SPE_NV_REMOTE = 0xd, + ARM_SPE_NV_DRAM = 0xe, +}; + +struct arm_spe_record { + enum arm_spe_sample_type type; + int err; + u32 op; + u32 latency; + u64 from_ip; + u64 to_ip; + u64 timestamp; + u64 virt_addr; + u64 phys_addr; + u64 context_id; + u64 boost_spe_addr[8]; + u64 remote_addr; + u16 boost_spe_idx; + u16 source; +}; + +struct arm_spe_buffer { + const unsigned char *buf; + size_t len; + u64 offset; + u64 trace_nr; +}; + +struct arm_spe_decoder { + struct arm_spe_record record; + const unsigned char *buf; + size_t len; + struct arm_spe_pkt packet; +}; + +void arm_spe_decoder_free(struct arm_spe_decoder *decoder); +void arm_spe_decode_buf(const unsigned char *buf, size_t len); +void mm_spe_record_enqueue(struct arm_spe_record *record); +#endif diff --git a/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.c b/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.c new file mode 100644 index 0000000000000000000000000000000000000000..aeec434487798475c7899cce9f91e8fb0a6f272e --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Arm Statistical Profiling Extensions (SPE) support + * Copyright (c) 2017-2018, Arm Ltd. 
+ */ + +#include +#include +#include +#include +#include + +#include "arm-spe-pkt-decoder.h" + +/* + * Extracts the field "sz" from header bits and converts to bytes: + * 00 : byte (1) + * 01 : halfword (2) + * 10 : word (4) + * 11 : doubleword (8) + */ +static unsigned int arm_spe_payload_len(unsigned char hdr) +{ + return 1U << ((hdr & GENMASK_ULL(5, 4)) >> 4); +} + +static int arm_spe_get_payload(const unsigned char *buf, size_t len, + unsigned char ext_hdr, + struct arm_spe_pkt *packet) +{ + size_t payload_len = arm_spe_payload_len(buf[ext_hdr]); + + if (len < 1 + ext_hdr + payload_len) + return ARM_SPE_NEED_MORE_BYTES; + + buf += 1 + ext_hdr; + + switch (payload_len) { + case 1: + packet->payload = *(uint8_t *)buf; + break; + case 2: + packet->payload = le16_to_cpu(*(uint16_t *)buf); + break; + case 4: + packet->payload = le32_to_cpu(*(uint32_t *)buf); + break; + case 8: + packet->payload = le64_to_cpu(*(uint64_t *)buf); + break; + default: + return ARM_SPE_BAD_PACKET; + } + + return 1 + ext_hdr + payload_len; +} + +static int arm_spe_get_pad(struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_PAD; + return 1; +} + +static int arm_spe_get_alignment(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + unsigned int alignment = 1 << ((buf[0] & 0xf) + 1); + + if (len < alignment) + return ARM_SPE_NEED_MORE_BYTES; + + packet->type = ARM_SPE_PAD; + return alignment - (((uintptr_t)buf) & (alignment - 1)); +} + +static int arm_spe_get_end(struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_END; + return 1; +} + +static int arm_spe_get_timestamp(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_TIMESTAMP; + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_events(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_EVENTS; + + /* we use index to identify Events with a less number of + * comparisons in arm_spe_pkt_desc(): E.g., the LLC-ACCESS, + * LLC-REFILL, and REMOTE-ACCESS events are identified if + * index > 1. 
+ */ + packet->index = arm_spe_payload_len(buf[0]); + + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_data_source(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_DATA_SOURCE; + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_context(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_CONTEXT; + packet->index = SPE_CTX_PKT_HDR_INDEX(buf[0]); + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_op_type(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_OP_TYPE; + packet->index = SPE_OP_PKT_HDR_CLASS(buf[0]); + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_counter(const unsigned char *buf, size_t len, + const unsigned char ext_hdr, struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_COUNTER; + + if (ext_hdr) + packet->index = SPE_HDR_EXTENDED_INDEX(buf[0], buf[1]); + else + packet->index = SPE_HDR_SHORT_INDEX(buf[0]); + + return arm_spe_get_payload(buf, len, ext_hdr, packet); +} + +static int arm_spe_get_addr(const unsigned char *buf, size_t len, + const unsigned char ext_hdr, struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_ADDRESS; + + if (ext_hdr) + packet->index = SPE_HDR_EXTENDED_INDEX(buf[0], buf[1]); + else + packet->index = SPE_HDR_SHORT_INDEX(buf[0]); + + return arm_spe_get_payload(buf, len, ext_hdr, packet); +} + +static int arm_spe_do_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + unsigned int hdr; + unsigned char ext_hdr = 0; + + memset(packet, 0, sizeof(struct arm_spe_pkt)); + + if (!len) + return ARM_SPE_NEED_MORE_BYTES; + + hdr = buf[0]; + + if (hdr == SPE_HEADER0_PAD) + return arm_spe_get_pad(packet); + + if (hdr == SPE_HEADER0_END) /* no timestamp at end of record */ + return arm_spe_get_end(packet); + + if (hdr == SPE_HEADER0_TIMESTAMP) + return arm_spe_get_timestamp(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK1) == SPE_HEADER0_EVENTS) + return arm_spe_get_events(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK1) == SPE_HEADER0_SOURCE) + return arm_spe_get_data_source(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_CONTEXT) + return arm_spe_get_context(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_OP_TYPE) + return arm_spe_get_op_type(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_EXTENDED) { + /* 16-bit extended format header */ + if (len == 1) + return ARM_SPE_BAD_PACKET; + + ext_hdr = 1; + hdr = buf[1]; + if (hdr == SPE_HEADER1_ALIGNMENT) + return arm_spe_get_alignment(buf, len, packet); + } + + /* + * The short format header's byte 0 or the extended format header's + * byte 1 has been assigned to 'hdr', which uses the same encoding for + * address packet and counter packet, so don't need to distinguish if + * it's short format or extended format and handle in once. + */ + if ((hdr & SPE_HEADER0_MASK3) == SPE_HEADER0_ADDRESS) + return arm_spe_get_addr(buf, len, ext_hdr, packet); + + if ((hdr & SPE_HEADER0_MASK3) == SPE_HEADER0_COUNTER) + return arm_spe_get_counter(buf, len, ext_hdr, packet); + + return ARM_SPE_BAD_PACKET; +} + +int arm_spe_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + int ret; + + ret = arm_spe_do_get_packet(buf, len, packet); + /* put multiple consecutive PADs on the same line, up to + * the fixed-width output format of 16 bytes per line. 
+ */ + if (ret > 0 && packet->type == ARM_SPE_PAD) { + while (ret < 16 && len > (size_t)ret && !buf[ret]) + ret += 1; + } + return ret; +} diff --git a/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.h b/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..873c3590e4a8769ab587cfe7580f8a3d3e69e6b7 --- /dev/null +++ b/drivers/arm/mm_monitor/spe-decoder/arm-spe-pkt-decoder.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Arm Statistical Profiling Extensions (SPE) support + * Copyright (c) 2017-2018, Arm Ltd. + */ + +#ifndef INCLUDE__ARM_SPE_PKT_DECODER_H__ +#define INCLUDE__ARM_SPE_PKT_DECODER_H__ + +#include + +#define ARM_SPE_PKT_DESC_MAX 256 +#define ARM_SPE_NEED_MORE_BYTES -1 +#define ARM_SPE_BAD_PACKET -2 +#define ARM_SPE_PKT_MAX_SZ 16 + +enum arm_spe_pkt_type { + ARM_SPE_BAD, + ARM_SPE_PAD, + ARM_SPE_END, + ARM_SPE_TIMESTAMP, + ARM_SPE_ADDRESS, + ARM_SPE_COUNTER, + ARM_SPE_CONTEXT, + ARM_SPE_OP_TYPE, + ARM_SPE_EVENTS, + ARM_SPE_DATA_SOURCE, +}; + +struct arm_spe_pkt { + enum arm_spe_pkt_type type; + unsigned char index; + uint64_t payload; +}; + +/* Short header (HEADER0) and extended header (HEADER1) */ +#define SPE_HEADER0_PAD 0x0 +#define SPE_HEADER0_END 0x1 +#define SPE_HEADER0_TIMESTAMP 0x71 +/* Mask for event & data source */ +#define SPE_HEADER0_MASK1 (GENMASK_ULL(7, 6) | GENMASK_ULL(3, 0)) +#define SPE_HEADER0_EVENTS 0x42 +#define SPE_HEADER0_SOURCE 0x43 +/* Mask for context & operation */ +#define SPE_HEADER0_MASK2 GENMASK_ULL(7, 2) +#define SPE_HEADER0_CONTEXT 0x64 +#define SPE_HEADER0_OP_TYPE 0x48 +/* Mask for extended format */ +#define SPE_HEADER0_EXTENDED 0x20 +/* Mask for address & counter */ +#define SPE_HEADER0_MASK3 GENMASK_ULL(7, 3) +#define SPE_HEADER0_ADDRESS 0xb0 +#define SPE_HEADER0_COUNTER 0x98 +#define SPE_HEADER1_ALIGNMENT 0x0 + +#define SPE_HDR_SHORT_INDEX(h) ((h) & GENMASK_ULL(2, 0)) +#define SPE_HDR_EXTENDED_INDEX(h0, h1) (((h0) & GENMASK_ULL(1, 0)) << 3 | \ + SPE_HDR_SHORT_INDEX(h1)) + +/* Address packet header */ +#define SPE_ADDR_PKT_HDR_INDEX_INS 0x0 +#define SPE_ADDR_PKT_HDR_INDEX_BRANCH 0x1 +#define SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT 0x2 +#define SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS 0x3 +#define SPE_ADDR_PKT_HDR_INDEX_PREV_BRANCH 0x4 +#define SPE_ADDR_PKT_HDR_INDEX_BOOST_SPE_DATA_PHYS 0x6 +#define SPE_ADDR_PKT_HDR_INDEX_REMOTE_DATA_PHYS 0x7 + +/* Address packet payload */ +#define SPE_ADDR_PKT_ADDR_BYTE7_SHIFT 56 +#define SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(v) ((v) & GENMASK_ULL(55, 0)) +#define SPE_ADDR_PKT_ADDR_GET_BYTE_6(v) (((v) & GENMASK_ULL(55, 48)) >> 48) + +#define SPE_ADDR_PKT_GET_NS(v) (((v) & BIT_ULL(63)) >> 63) +#define SPE_ADDR_PKT_GET_EL(v) (((v) & GENMASK_ULL(62, 61)) >> 61) +#define SPE_ADDR_PKT_GET_CH(v) (((v) & BIT_ULL(62)) >> 62) +#define SPE_ADDR_PKT_GET_PAT(v) (((v) & GENMASK_ULL(59, 56)) >> 56) + +#define SPE_ADDR_PKT_EL0 0 +#define SPE_ADDR_PKT_EL1 1 +#define SPE_ADDR_PKT_EL2 2 +#define SPE_ADDR_PKT_EL3 3 + +/* Boost_spe address packet payload */ +#define SPE_ADDR_PKT_ADDR_GET_BYTES_BOOST_SPE(v) ((v) & GENMASK_ULL(52, 12)) + +/* Context packet header */ +#define SPE_CTX_PKT_HDR_INDEX(h) ((h) & GENMASK_ULL(1, 0)) + +/* Counter packet header */ +#define SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT 0x0 +#define SPE_CNT_PKT_HDR_INDEX_ISSUE_LAT 0x1 +#define SPE_CNT_PKT_HDR_INDEX_TRANS_LAT 0x2 + +/* Event packet payload */ +enum arm_spe_events { + EV_EXCEPTION_GEN = 0, + EV_RETIRED = 1, + EV_L1D_ACCESS = 2, + EV_L1D_REFILL = 3, + EV_TLB_ACCESS = 4, + 
EV_TLB_WALK = 5, + EV_NOT_TAKEN = 6, + EV_MISPRED = 7, + EV_LLC_ACCESS = 8, + EV_LLC_MISS = 9, + EV_REMOTE_ACCESS = 10, + EV_ALIGNMENT = 11, + EV_PARTIAL_PREDICATE = 17, + EV_EMPTY_PREDICATE = 18, +}; + +/* Operation packet header */ +#define SPE_OP_PKT_HDR_CLASS(h) ((h) & GENMASK_ULL(1, 0)) +#define SPE_OP_PKT_HDR_CLASS_OTHER 0x0 +#define SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC 0x1 +#define SPE_OP_PKT_HDR_CLASS_BR_ERET 0x2 + +#define SPE_OP_PKT_IS_OTHER_SVE_OP(v) (((v) & (BIT(7) | BIT(3) | BIT(0))) == 0x8) + +#define SPE_OP_PKT_COND BIT(0) + +#define SPE_OP_PKT_LDST_SUBCLASS_GET(v) ((v) & GENMASK_ULL(7, 1)) +#define SPE_OP_PKT_LDST_SUBCLASS_GP_REG 0x0 +#define SPE_OP_PKT_LDST_SUBCLASS_SIMD_FP 0x4 +#define SPE_OP_PKT_LDST_SUBCLASS_UNSPEC_REG 0x10 +#define SPE_OP_PKT_LDST_SUBCLASS_NV_SYSREG 0x30 + +#define SPE_OP_PKT_IS_LDST_ATOMIC(v) (((v) & (GENMASK_ULL(7, 5) | BIT(1))) == 0x2) + +#define SPE_OP_PKT_AR BIT(4) +#define SPE_OP_PKT_EXCL BIT(3) +#define SPE_OP_PKT_AT BIT(2) +#define SPE_OP_PKT_ST BIT(0) + +#define SPE_OP_PKT_IS_LDST_SVE(v) (((v) & (BIT(3) | BIT(1))) == 0x8) + +#define SPE_OP_PKT_SVE_SG BIT(7) +/* + * SVE effective vector length (EVL) is stored in byte 0 bits [6:4]; + * the length is rounded up to a power of two and use 32 as one step, + * so EVL calculation is: + * + * 32 * (2 ^ bits [6:4]) = 32 << (bits [6:4]) + */ +#define SPE_OP_PKG_SVE_EVL(v) (32 << (((v) & GENMASK_ULL(6, 4)) >> 4)) +#define SPE_OP_PKT_SVE_PRED BIT(2) +#define SPE_OP_PKT_SVE_FP BIT(1) + +#define SPE_OP_PKT_IS_INDIRECT_BRANCH(v) (((v) & GENMASK_ULL(7, 1)) == 0x2) + +const char *arm_spe_pkt_name(enum arm_spe_pkt_type); + +int arm_spe_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet); + +int arm_spe_pkt_desc(const struct arm_spe_pkt *packet, char *buf, size_t len); +#endif diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index 05dda19c5359a39849ac9a3c6b1a5cf14eb0614a..85e72a392a31afefd5915040d9777187ca08350e 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -16,6 +16,10 @@ #include #include +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +#include "../drivers/arm/mm_monitor/mm_spe.h" +#endif + static DEFINE_PER_CPU(struct arm_pmu *, probed_pmus); static DEFINE_PER_CPU(int, pmu_irqs); @@ -162,6 +166,32 @@ static inline void arm_spe_acpi_register_device(void) { } #endif /* CONFIG_ARM_SPE_PMU */ +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +static struct resource spe_mem_sampling_resources[] = { + { + } +}; + +static struct platform_device spe_mem_sampling_dev = { + .name = ARMV8_SPE_MEM_SAMPLING_PDEV_NAME, + .id = -1, + .resource = spe_mem_sampling_resources, + .num_resources = ARRAY_SIZE(spe_mem_sampling_resources) +}; + +static void arm_spe_mem_sampling_acpi_register_device(void) +{ + int ret; + + ret = platform_device_register(&spe_mem_sampling_dev); + if (ret < 0) + pr_warn("ACPI: SPE_MEM_SAMPLING: Unable to register device\n"); +} +#else +static inline void arm_spe_mem_sampling_acpi_register_device(void) +{ +} +#endif /* CONFIG_ARM_SPE_MEM_SAMPLING */ #if IS_ENABLED(CONFIG_CORESIGHT_TRBE) static struct resource trbe_resources[] = { @@ -432,6 +462,7 @@ static int arm_pmu_acpi_init(void) return 0; arm_spe_acpi_register_device(); + arm_spe_mem_sampling_acpi_register_device(); arm_trbe_acpi_register_device(); return 0; diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 71835682046e133d4a17044e12c018842337ec0b..8562385d901e6c43a7902d12646135e4afb12650 100644 --- a/drivers/perf/arm_spe_pmu.c +++ 
b/drivers/perf/arm_spe_pmu.c @@ -33,12 +33,19 @@ #include #include #include +#if IS_ENABLED(CONFIG_MEM_SAMPLING) +#include +#endif #include #include #include #include +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +#include "../drivers/arm/mm_monitor/mm_spe.h" +#endif + /* * Cache if the event is allowed to trace Context information. * This allows us to perform the check, i.e, perf_allow_kernel(), @@ -46,6 +53,10 @@ */ #define SPE_PMU_HW_FLAGS_CX 0x00001 +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +static struct arm_spe_pmu *spe_pmu_local; +#endif + static_assert((PERF_EVENT_FLAG_ARCH & SPE_PMU_HW_FLAGS_CX) == SPE_PMU_HW_FLAGS_CX); static void set_spe_event_has_cx(struct perf_event *event) @@ -583,13 +594,21 @@ arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) * If we've lost data, disable profiling and also set the PARTIAL * flag to indicate that the last record is corrupted. */ +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user) && FIELD_GET(PMBSR_EL1_DL, pmbsr)) +#else if (FIELD_GET(PMBSR_EL1_DL, pmbsr)) +#endif perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED | PERF_AUX_FLAG_PARTIAL); /* Report collisions to userspace so that it can up the period */ +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user) && FIELD_GET(PMBSR_EL1_DL, pmbsr)) +#else if (FIELD_GET(PMBSR_EL1_COLL, pmbsr)) perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION); +#endif /* We only expect buffer management events */ switch (FIELD_GET(PMBSR_EL1_EC, pmbsr)) { @@ -622,7 +641,12 @@ arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) ret = SPE_PMU_BUF_FAULT_ACT_FATAL; out_stop: +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user)) + arm_spe_perf_aux_output_end(handle); +#else arm_spe_perf_aux_output_end(handle); +#endif return ret; } @@ -632,7 +656,11 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) struct perf_event *event = handle->event; enum arm_spe_pmu_buf_fault_action act; +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user) && !perf_get_aux(handle)) +#else if (!perf_get_aux(handle)) +#endif return IRQ_NONE; act = arm_spe_pmu_buf_get_fault_act(handle); @@ -643,7 +671,12 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) * Ensure perf callbacks have completed, which may disable the * profiling buffer in response to a TRUNCATION flag. */ +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (!__this_cpu_read(arm_spe_user)) + irq_work_run(); +#else irq_work_run(); +#endif switch (act) { case SPE_PMU_BUF_FAULT_ACT_FATAL: @@ -663,6 +696,12 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) * PMBPTR might be misaligned, but we'll burn that bridge * when we get to it. 
*/ +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + if (__this_cpu_read(arm_spe_user)) { + mem_sampling_process(); + break; + } +#endif if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) { arm_spe_perf_aux_output_begin(handle, event); isb(); @@ -758,6 +797,10 @@ static void arm_spe_pmu_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle); +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + arm_spe_set_user(SPE_USER_PERF); +#endif + hwc->state = 0; arm_spe_perf_aux_output_begin(handle, event); if (hwc->state) @@ -797,8 +840,16 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle); /* If we're already stopped, then nothing to do */ - if (hwc->state & PERF_HES_STOPPED) + if (hwc->state & PERF_HES_STOPPED) { +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + /* + * PERF_HES_STOPPED maybe set in arm_spe_perf_aux_output_begin, + * we switch user here. + */ + arm_spe_set_user(SPE_USER_MEM_SAMPLING); +#endif return; + } /* Stop all trace generation */ arm_spe_pmu_disable_and_drain_local(); @@ -829,6 +880,9 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) } hwc->state |= PERF_HES_STOPPED; +#if IS_ENABLED(CONFIG_MEM_SAMPLING) + arm_spe_set_user(SPE_USER_MEM_SAMPLING); +#endif } static int arm_spe_pmu_add(struct perf_event *event, int flags) @@ -1129,6 +1183,9 @@ static int arm_spe_pmu_cpu_startup(unsigned int cpu, struct hlist_node *node) if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) return 0; +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) + mm_spe_percpu_buffer_alloc(cpu); +#endif __arm_spe_pmu_setup_one(spe_pmu); return 0; } @@ -1141,6 +1198,9 @@ static int arm_spe_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) return 0; +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) + mm_spe_percpu_buffer_free(cpu); +#endif __arm_spe_pmu_stop_one(spe_pmu); return 0; } @@ -1176,6 +1236,9 @@ static int arm_spe_pmu_dev_init(struct arm_spe_pmu *spe_pmu) static void arm_spe_pmu_dev_teardown(struct arm_spe_pmu *spe_pmu) { +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) + mm_spe_buffer_free(); +#endif cpuhp_state_remove_instance(arm_spe_pmu_online, &spe_pmu->hotplug_node); free_percpu_irq(spe_pmu->irq, spe_pmu->handle); } @@ -1215,6 +1278,26 @@ static const struct platform_device_id arm_spe_match[] = { }; MODULE_DEVICE_TABLE(platform, arm_spe_match); +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) +static bool arm_spe_get_attr(void) +{ + struct mm_spe *p; + + p = mm_spe_get_desc(); + if (!p) { + pr_err("get spe pmu cap from arm spe driver failed!\n"); + return false; + } + + p->supported_cpus = spe_pmu_local->supported_cpus; + p->irq = spe_pmu_local->irq; + p->features = spe_pmu_local->features; + p->min_period = spe_pmu_local->min_period; + + return true; +} +#endif + static int arm_spe_pmu_device_probe(struct platform_device *pdev) { int ret; @@ -1249,6 +1332,21 @@ static int arm_spe_pmu_device_probe(struct platform_device *pdev) if (ret) goto out_free_handle; +#if IS_ENABLED(CONFIG_ARM_SPE_MEM_SAMPLING) + /* + * Ensure that all CPUs that support SPE can apply for the cache + * area, with each CPU defaulting to 4K * 2. Failure to do so will + * result in the inability to collect SPE data in kernel mode. 
+ */ + ret = mm_spe_buffer_alloc(); + if (ret) + goto out_teardown_dev; + + spe_pmu_local = spe_pmu; + if (arm_spe_get_attr()) + mm_spe_add_probe_status(); + +#endif ret = arm_spe_pmu_perf_init(spe_pmu); if (ret) goto out_teardown_dev; @@ -1262,6 +1360,18 @@ static int arm_spe_pmu_device_probe(struct platform_device *pdev) return ret; } +#if IS_ENABLED(CONFIG_MEM_SAMPLING) +void arm_spe_set_user(enum arm_spe_user_e user) +{ + __this_cpu_write(arm_spe_user, user); + if (user == SPE_USER_PERF) + mem_sampling_user_switch_process(USER_SWITCH_AWAY_FROM_MEM_SAMPLING); + else + mem_sampling_user_switch_process(USER_SWITCH_BACK_TO_MEM_SAMPLING); + __arm_spe_pmu_reset_local(); +} +#endif + static int arm_spe_pmu_device_remove(struct platform_device *pdev) { struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev); diff --git a/include/linux/damon.h b/include/linux/damon.h index 343132a146cf04b546a07eb6c653608d55aac9c1..e544de649dc3ec1db69b4054eeec6e0503ab4576 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -13,6 +13,7 @@ #include #include #include +#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE @@ -73,6 +74,9 @@ struct damon_region { */ struct damon_target { struct pid *pid; +#ifdef CONFIG_DAMON_MEM_SAMPLING + struct damon_mem_sampling_fifo damon_fifo; +#endif unsigned int nr_regions; struct list_head regions_list; struct list_head list; @@ -89,6 +93,7 @@ struct damon_target { * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. + * @DAMOS_DEMOTION: Migrate cold page areas to specific nodes. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions * @@ -106,6 +111,7 @@ enum damos_action { DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, + DAMOS_DEMOTION, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; @@ -298,6 +304,7 @@ struct damos_access_pattern { * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @pattern: Access pattern of target regions. * @action: &damo_action to be applied to the target regions. + * @remote_node: The NUMA node ID from which the cold page will be moved. * @apply_interval_us: The time between applying the @action. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. @@ -330,6 +337,7 @@ struct damos_access_pattern { struct damos { struct damos_access_pattern pattern; enum damos_action action; + nodemask_t remote_node; unsigned long apply_interval_us; /* private: internal use only */ /* diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h new file mode 100644 index 0000000000000000000000000000000000000000..602a5efcb77c9840becd7cdce69c24ef1841086f --- /dev/null +++ b/include/linux/mem_sampling.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * mem_sampling.h: declare the mem_sampling abstract layer and provide + * unified pmu sampling for NUMA, DAMON, etc. + * + * Sample records are converted to mem_sampling_record, and then + * mem_sampling_record_captured_cb_type invoke the callbacks to + * pass the record. + * + * Copyright (c) 2024-2025, Huawei Technologies Ltd. 
+ */ +#ifndef __MEM_SAMPLING_H +#define __MEM_SAMPLING_H + +#include + +enum mem_sampling_sample_type { + MEM_SAMPLING_L1D_ACCESS = 1 << 0, + MEM_SAMPLING_L1D_MISS = 1 << 1, + MEM_SAMPLING_LLC_ACCESS = 1 << 2, + MEM_SAMPLING_LLC_MISS = 1 << 3, + MEM_SAMPLING_TLB_ACCESS = 1 << 4, + MEM_SAMPLING_TLB_MISS = 1 << 5, + MEM_SAMPLING_BRANCH_MISS = 1 << 6, + MEM_SAMPLING_REMOTE_ACCESS = 1 << 7, +}; + +enum mem_sampling_op_type { + MEM_SAMPLING_LD = 1 << 0, + MEM_SAMPLING_ST = 1 << 1, +}; + +enum arm_spe_user_e { + SPE_USER_PERF, + SPE_USER_MEM_SAMPLING, +}; +DECLARE_PER_CPU(enum arm_spe_user_e, arm_spe_user); + +struct mem_sampling_record { + enum mem_sampling_sample_type type; + int err; + u32 op; + u32 latency; + u64 from_ip; + u64 to_ip; + u64 timestamp; + u64 virt_addr; + u64 phys_addr; + u64 context_id; + u64 boost_spe_addr[8]; + u64 rem_addr; + u16 source; +}; + +struct mem_sampling_ops_struct { + int (*sampling_start)(void); + void (*sampling_stop)(void); + void (*sampling_continue)(void); + void (*sampling_decoding)(void); + struct mm_spe_buf* (*mm_spe_getbuf_addr)(void); + int (*mm_spe_getnum_record)(void); + +}; +extern struct mem_sampling_ops_struct mem_sampling_ops; + +enum mem_sampling_type_enum { + MEM_SAMPLING_ARM_SPE, + MEM_SAMPLING_UNSUPPORTED +}; + +enum user_switch_type { + USER_SWITCH_AWAY_FROM_MEM_SAMPLING, + USER_SWITCH_BACK_TO_MEM_SAMPLING, +}; + +DECLARE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling); +extern struct static_key_false mem_sampling_access_hints; + +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING +int mm_spe_start(void); +void mm_spe_stop(void); +void mm_spe_continue(void); +void mm_spe_decoding(void); +int mm_spe_getnum_record(void); +struct mm_spe_buf *mm_spe_getbuf_addr(void); +int mm_spe_enabled(void); +void arm_spe_set_probe_status(int status); +#else +static inline void mm_spe_stop(void) { } +static inline void mm_spe_continue(void) { } +static inline void mm_spe_decoding(void) { } +static inline void arm_spe_set_probe_status(int status) { } +static inline int mm_spe_start(void) { return 0; } +static inline int mm_spe_getnum_record(void) { return 0; } +static inline struct mm_spe_buf *mm_spe_getbuf_addr(void) { return NULL; } +static inline int mm_spe_enabled(void) { return 0; } +#endif /* CONFIG_ARM_SPE_MEM_SAMPLING */ + +#if IS_ENABLED(CONFIG_MEM_SAMPLING) +void mem_sampling_process(void); +void arm_spe_set_user(enum arm_spe_user_e user); +void set_mem_sampling_state(bool enabled); +void mem_sampling_user_switch_process(enum user_switch_type type); +void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr); +#else +static inline void set_mem_sampling_state(bool enabled) { } +static inline void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr) { } +#endif /* CONFIG_MEM_SAMPLING */ + +#ifdef CONFIG_DAMON_MEM_SAMPLING +#define DAMOS_FIFO_MAX_RECORD (1024) +struct damon_mem_sampling_record { + u64 vaddr; +}; + +struct damon_mem_sampling_fifo { + struct kfifo rx_kfifo; + spinlock_t rx_kfifo_lock; /* protect SPE Rx data kfifo */ +}; + +bool damon_use_mem_sampling(void); +#else +static inline bool damon_use_mem_sampling(void) { return false; } +#endif /* CONFIG_DAMON_MEM_SAMPLING */ +#endif /* __MEM_SAMPLING_H */ diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 2e81ac87e6f6d91610efb3daafa2c33187bab8b3..29cc0d842a8fa81ec2dad3a5d4357a2bb94c45e0 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -167,6 +167,8 @@ static inline void check_highest_zone(enum zone_type k) int 
do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); +int do_migrate_area_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, unsigned long start, unsigned long end, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index f37cc03f9369ed61e49f7be69564fa017ab51801..302c659dc626c92ba258c62671f4cca97412209c 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -29,6 +29,7 @@ enum migrate_reason { MR_CONTIG_RANGE, MR_LONGTERM_PIN, MR_DEMOTION, + MR_DAMON_DEMOTION, MR_TYPES }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b4442fbbf17bf7df534f03c3807d762dc17209c6..64c38b09e18d5579dd362cc160f68d6535c70428 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1011,7 +1011,11 @@ struct mm_struct { #endif } __randomize_layout; +#ifdef CONFIG_DAMON_MEM_SAMPLING + KABI_USE(1, struct damon_mem_sampling_fifo *damon_fifo) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index a4e40ae6a8c8fd2e51cdbf877cf3bb485690cc9a..4bcbf613c9a3852256737fd19e1bf8937a172ccf 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -8,6 +8,7 @@ #include #include #include +#include TRACE_EVENT(kmem_cache_alloc, @@ -409,6 +410,117 @@ TRACE_EVENT(rss_stat, __print_symbolic(__entry->member, TRACE_MM_PAGES), __entry->size) ); +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING +TRACE_EVENT(mm_spe_record, + TP_PROTO(struct mem_sampling_record *record), + + TP_ARGS(record), + + TP_STRUCT__entry( + __field(u64, vaddr) + __field(u64, paddr) + __field(int, pid) + ), + + TP_fast_assign( + __entry->vaddr = record->virt_addr; + __entry->paddr = record->phys_addr; + __entry->pid = record->context_id; + + ), + + TP_printk("vaddr=%llu paddr=%llu pid=%d", + __entry->vaddr, __entry->paddr, __entry->pid) +); + +TRACE_EVENT(spe_boost_spe_record, + TP_PROTO(struct mem_sampling_record *record), + + TP_ARGS(record), + + TP_STRUCT__entry( + __field(u64, boost_spe_pa1) + __field(u64, boost_spe_pa2) + __field(u64, boost_spe_pa3) + __field(u64, boost_spe_pa4) + __field(u64, boost_spe_pa5) + __field(u64, boost_spe_pa6) + __field(u64, boost_spe_pa7) + __field(u64, boost_spe_pa8) + ), + + TP_fast_assign( + __entry->boost_spe_pa1 = record->boost_spe_addr[0]; + __entry->boost_spe_pa2 = record->boost_spe_addr[1]; + __entry->boost_spe_pa3 = record->boost_spe_addr[2]; + __entry->boost_spe_pa4 = record->boost_spe_addr[3]; + __entry->boost_spe_pa5 = record->boost_spe_addr[4]; + __entry->boost_spe_pa6 = record->boost_spe_addr[5]; + __entry->boost_spe_pa7 = record->boost_spe_addr[6]; + __entry->boost_spe_pa8 = record->boost_spe_addr[7]; + ), + + TP_printk("boost_spe_addr[0]=0x%llx boost_spe_addr[1]=0x%llx tlb_addr[2]=0x%llx tlb_addr[3]=0x%llx tlb_addr[4]=0x%llx tlb_addr[5]=0x%llx tlb_addr[6]=0x%llx tlb_addr[7]=0x%llx", + __entry->boost_spe_pa1, __entry->boost_spe_pa2, + __entry->boost_spe_pa3, __entry->boost_spe_pa4, + __entry->boost_spe_pa5, __entry->boost_spe_pa6, + __entry->boost_spe_pa7, __entry->boost_spe_pa8) +); +#endif /* CONFIG_ARM_SPE_MEM_SAMPLING */ + + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +TRACE_EVENT(mm_numa_migrating, + + TP_PROTO(u64 vaddr, int page_nid, int target_nid, + int migrate_success), + + TP_ARGS(vaddr, page_nid, target_nid, migrate_success), + + TP_STRUCT__entry( + __field(u64, 
vaddr) + __field(int, page_nid) + __field(int, target_nid) + __field(int, migrate_success) + ), + + TP_fast_assign( + __entry->vaddr = vaddr; + __entry->page_nid = page_nid; + __entry->target_nid = target_nid; + __entry->migrate_success = !!(migrate_success); + ), + + TP_printk("vaddr=%llu page_nid=%d target_nid=%d migrate_success=%d", + __entry->vaddr, __entry->page_nid, + __entry->target_nid, __entry->migrate_success) +); + +TRACE_EVENT(mm_mem_sampling_access_record, + + TP_PROTO(u64 vaddr, u64 paddr, int cpuid, int pid), + + TP_ARGS(vaddr, paddr, cpuid, pid), + + TP_STRUCT__entry( + __field(u64, vaddr) + __field(u64, paddr) + __field(int, cpuid) + __field(int, pid) + ), + + TP_fast_assign( + __entry->vaddr = vaddr; + __entry->paddr = paddr; + __entry->cpuid = cpuid; + __entry->pid = pid; + ), + + TP_printk("vaddr=%llu paddr=%llu cpuid=%d pid=%d", + __entry->vaddr, __entry->paddr, + __entry->cpuid, __entry->pid) +); +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 0190ef725b439696973eaefb75e453c360535117..bafe4208de73328a09258eac0385a4c14571fbfb 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -22,7 +22,8 @@ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ EM( MR_CONTIG_RANGE, "contig_range") \ EM( MR_LONGTERM_PIN, "longterm_pin") \ - EMe(MR_DEMOTION, "demotion") + EM(MR_DEMOTION, "demotion") \ + EMe(MR_DAMON_DEMOTION, "damon_demotion") /* * First define the enums in the above macros to be exported to userspace diff --git a/kernel/fork.c b/kernel/fork.c index 698d7829f2e448d5684fad1d8ad0c593c9755c3e..4b37cb915f7b8d0d9a8e7571bf92a259a9d25287 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1362,6 +1362,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; +#endif +#if defined(CONFIG_DAMON_MEM_SAMPLING) + mm->damon_fifo = NULL; #endif mm_init_uprobes_state(mm); hugetlb_count_init(mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7dc4ceebd5ec0e9ecddb4fd28ec139aa63858d02..7c40690ad56f4e4ad4d2986ce71e96923f6540ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -64,6 +64,7 @@ #include #include #include +#include #ifdef CONFIG_PREEMPT_DYNAMIC # ifdef CONFIG_GENERIC_ENTRY @@ -5307,6 +5308,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = READ_ONCE(prev->__state); vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + mem_sampling_sched_in(prev, current); finish_task(prev); tick_nohz_task_switch(); finish_lock_switch(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c530d501bb48637e6b5c6ac41eba34b73f745c9f..468a4d747933678015a5420df79f8b19e98b1fe7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -48,6 +48,7 @@ #include #include #include +#include #include @@ -3368,6 +3369,18 @@ static void task_numa_work(struct callback_head *work) long pages, virtpages; struct vma_iterator vmi; +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + /* + * If we are using access hints from hardware (like using + * SPE), don't scan the address space. + * Note that currently PMD-level page migration is not + * supported. 
+ */ + if (static_branch_unlikely(&mem_sampling_access_hints) && + static_branch_unlikely(&sched_numabalancing_mem_sampling)) + return; +#endif + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); work->next = work; diff --git a/mm/Kconfig b/mm/Kconfig index 56171b9dd8730088ab7028d2902183d0ce9a6290..88addd002bb5d29f6bb0361c03e6c2007e326c1f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1452,6 +1452,34 @@ config BPF_READAHEAD of the kernel is adjusted based on the application read mode to optimize the read performance in the Spark SQL scenario, +config MEM_SAMPLING + bool "Use hardware memory sampling for kernel features (NUMA, DAMON, etc.)" + default n + depends on ARM64 + select ARM_SPE_MEM_SAMPLING if ARM64 + help + This option enables hardware-based memory sampling for kernel features + such as NUMA balancing and DAMON. If disabled, software-based memory + sampling will be used instead. + + Memory sampling is based on specific hardware capabilities, which + allow hardware PMUs to sample memory accesses for use by kernel + features. It requires at least one hardware PMU (e.g. ARM_SPE_MEM_SAMPLING) + to be enabled. + +config NUMABALANCING_MEM_SAMPLING + bool "Use hardware memory samples for NUMA balancing" + depends on MEM_SAMPLING && NUMA_BALANCING + default n + help + This feature relies on hardware sampling: the NUMA balancing policy + will use memory access information obtained from hardware sampling + instead of the native software PROT_NONE scheme. Turning on this + feature may have a performance impact on some workloads, for + example, lightweight memory access programs. + + If unsure, say N. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 11df2de8fdbe9d5a70e4ca3a73db68ebd9c9d331..674777b7c99ff0952edadda1c2ebb3b0b12e9f1b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -147,3 +147,4 @@ obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o obj-$(CONFIG_DYNAMIC_POOL) += dynamic_pool.o +obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 436c6b4cb5ec57fb7e47d093569ec093d7ec7401..d6ed1ef6ad4a5e76210c2daad5207be9b2743bfd 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -32,6 +32,20 @@ config DAMON_VADDR This builds the default data access monitoring operations for DAMON that work for virtual address spaces. +config DAMON_MEM_SAMPLING + bool "Set DAMON to use records from hardware sampling" + depends on MEM_SAMPLING && DAMON_VADDR + help + This enables DAMON to utilize hardware sampling-based memory access + monitoring data (e.g., ARM SPE, Intel PEBS, AMD IBS) instead of + software-based sampling. When enabled, DAMON will: + + - Use CPU performance monitoring unit (PMU) samples as data source + - Correlate hardware samples with process virtual address spaces + - Provide lower overhead monitoring compared to pure software approaches + + If unsure, say N. 
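The task_numa_work() hunk above shows the gating pattern this series applies to consumers of hardware access hints: the software scanning path is skipped only when both the global mem_sampling_access_hints key and the consumer's own per-feature key are enabled. Below is a minimal sketch of that pattern; it is illustrative only, the my_feature_* names are hypothetical and not part of this patch, and mem_sampling_access_hints itself is defined later in mm/mem_sampling.c.

/*
 * Illustrative sketch only: the double static-key check used by
 * hardware-sampling consumers. "my_feature_sampling" is hypothetical.
 */
#include <linux/jump_label.h>

DECLARE_STATIC_KEY_FALSE(mem_sampling_access_hints);	/* from mm/mem_sampling.c */
DEFINE_STATIC_KEY_FALSE(my_feature_sampling);

static void my_feature_scan_work(void)
{
	/*
	 * Skip the software (PROT_NONE-style) scan only when hardware
	 * sampling is globally enabled AND this feature has opted in.
	 */
	if (static_branch_unlikely(&mem_sampling_access_hints) &&
	    static_branch_unlikely(&my_feature_sampling))
		return;

	/* ...existing software scanning path... */
}

At run time the keys are flipped through the kernel.mem_sampling_enable sysctl added later in mm/mem_sampling.c: writing 2 additionally enables the NUMA-balancing consumer, 3 the DAMON consumer, and 4 both.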
+ config DAMON_PADDR bool "Data access monitoring operations for the physical address space" depends on DAMON && MMU diff --git a/mm/damon/core.c b/mm/damon/core.c index 1daa8793c44b3c1256e0255d158b3d33741c5535..c8a4427d1d630617072a076ae311ad640c422619 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -112,6 +112,32 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) return err; } +#if IS_ENABLED(CONFIG_DAMON_MEM_SAMPLING) +int damon_target_init_kfifo(struct damon_target *t) +{ + struct damon_mem_sampling_fifo *damon_fifo; + int ret = 0; + unsigned int fifo_size = sizeof(struct damon_mem_sampling_record) * DAMOS_FIFO_MAX_RECORD; + + damon_fifo = &t->damon_fifo; + + ret = kfifo_alloc(&damon_fifo->rx_kfifo, fifo_size, GFP_KERNEL); + if (ret) + return -ENOMEM; + + spin_lock_init(&damon_fifo->rx_kfifo_lock); + return 0; +} + +void damon_target_deinit_kfifo(struct damon_target *t) +{ + kfifo_free(&t->damon_fifo.rx_kfifo); +} +#else +static inline int damon_target_init_kfifo(struct damon_target *t) {return 0; } +static inline void damon_target_deinit_kfifo(struct damon_target *t) { } +#endif /* CONFIG_DAMON_MEM_SAMPLING */ + /* * Construct a damon_region struct * @@ -388,11 +414,18 @@ void damon_destroy_scheme(struct damos *s) struct damon_target *damon_new_target(void) { struct damon_target *t; + int ret; t = kmalloc(sizeof(*t), GFP_KERNEL); if (!t) return NULL; + ret = damon_target_init_kfifo(t); + if (ret) { + kfree(t); + return NULL; + } + t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); @@ -422,6 +455,7 @@ void damon_free_target(struct damon_target *t) damon_for_each_region_safe(r, next, t) damon_free_region(r); + damon_target_deinit_kfifo(t); kfree(t); } diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 26c948f87489ee1cdf9d97de9877ebc23185a8ed..dc570e90abca73c275cc3609467c6ca6c4678de8 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1123,6 +1123,7 @@ static const struct kobj_type damon_sysfs_access_pattern_ktype = { struct damon_sysfs_scheme { struct kobject kobj; enum damos_action action; + nodemask_t remote_node; struct damon_sysfs_access_pattern *access_pattern; struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; @@ -1140,6 +1141,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { "nohugepage", "lru_prio", "lru_deprio", + "demotion", "stat", }; @@ -1153,6 +1155,7 @@ static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( return NULL; scheme->kobj = (struct kobject){}; scheme->action = action; + scheme->remote_node = NODE_MASK_ALL; return scheme; } @@ -1356,6 +1359,36 @@ static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, return -EINVAL; } +static ssize_t remote_node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%*pbl\n", + nodemask_pr_args(&scheme->remote_node)); +} + +static ssize_t remote_node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + int ret; + nodemask_t new_mask; + + ret = nodelist_parse(buf, new_mask); + if (ret < 0) + return -EINVAL; + + if (!nodes_subset(new_mask, node_states[N_MEMORY])) + return -EINVAL; + + nodes_and(scheme->remote_node, new_mask, node_states[N_MEMORY]); + return count; +} + + static void 
damon_sysfs_scheme_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); @@ -1364,8 +1397,12 @@ static void damon_sysfs_scheme_release(struct kobject *kobj) static struct kobj_attribute damon_sysfs_scheme_action_attr = __ATTR_RW_MODE(action, 0600); +static struct kobj_attribute damon_sysfs_scheme_remote_node_attr = + __ATTR_RW_MODE(remote_node, 0600); + static struct attribute *damon_sysfs_scheme_attrs[] = { &damon_sysfs_scheme_action_attr.attr, + &damon_sysfs_scheme_remote_node_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme); @@ -1644,6 +1681,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->pattern.max_age_region = access_pattern->age->max; scheme->action = sysfs_scheme->action; + scheme->remote_node = sysfs_scheme->remote_node; scheme->quota.ms = sysfs_quotas->ms; scheme->quota.sz = sysfs_quotas->sz; @@ -1687,6 +1725,8 @@ int damon_sysfs_set_schemes(struct damon_ctx *ctx, damon_destroy_scheme(scheme); return -ENOMEM; } + + scheme->remote_node = sysfs_schemes->schemes_arr[i]->remote_node; damon_add_scheme(ctx, scheme); } return 0; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 5764b9885e7d215cbae84bc64c2b79508c2d260d..3a21410e631e7e894d45bb6c48ad7c34a9bbe9e9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "ops-common.h" @@ -402,6 +403,118 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) mmap_read_unlock(mm); } +#if IS_ENABLED(CONFIG_DAMON_MEM_SAMPLING) +/* + * Functions for the access checking of the regions with mem sampling + */ +static void __hw_damon_va_prepare_access_check(struct damon_region *r) +{ + r->sampling_addr = 0; +} + +static void hw_damon_va_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + mm->damon_fifo = &t->damon_fifo; + damon_for_each_region(r, t) + __hw_damon_va_prepare_access_check(r); + mmput(mm); + } +} + +static void find_damon_region(struct damon_mem_sampling_record *damon_record, + struct damon_target *t, unsigned int *max_nr_accesses) +{ + struct damon_region *r; + unsigned long addr = damon_record->vaddr; + + damon_for_each_region(r, t) { + if (r->sampling_addr != 0) + return; + if (addr > r->ar.start && addr < r->ar.end) { + r->nr_accesses++; + r->sampling_addr = addr; + *max_nr_accesses = max(r->nr_accesses, *max_nr_accesses); + return; + } + } +} + +static unsigned int hw_damon_va_check_accesses(struct damon_ctx *ctx) +{ + unsigned int outs; + struct damon_target *t; + struct mm_struct *mm; + unsigned int max_nr_accesses = 0; + struct damon_mem_sampling_record damon_record; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + mm->damon_fifo = NULL; + mmput(mm); + while (!kfifo_is_empty(&t->damon_fifo.rx_kfifo)) { + outs = kfifo_out(&t->damon_fifo.rx_kfifo, &damon_record, + sizeof(struct damon_mem_sampling_record)); + if (outs != sizeof(struct damon_mem_sampling_record)) { + pr_debug("damon hw spe record corrupted header. 
Flush.\n"); + continue; + } + find_damon_region(&damon_record, t, &max_nr_accesses); + } + kfifo_reset_out(&t->damon_fifo.rx_kfifo); + } + + return max_nr_accesses; +} +#else +static inline void hw_damon_va_prepare_access_checks(struct damon_ctx *ctx) { } +static inline unsigned int hw_damon_va_check_accesses(struct damon_ctx *ctx) {return 0; } +#endif + +#ifdef CONFIG_MIGRATION +static unsigned long damon_migrate_pages(struct damon_target *t, + struct damon_region *r, nodemask_t task_remote_nodes) +{ + struct mm_struct *mm = NULL; + unsigned long applied; + struct task_struct *task; + nodemask_t task_nodes; + + task = damon_get_task_struct(t); + if (!task) + return 0; + task_nodes = cpuset_mems_allowed(task); + put_task_struct(task); + + mm = damon_get_mm(t); + if (!mm) + return 0; + + applied = do_migrate_area_pages(mm, &task_nodes, &task_remote_nodes, + r->ar.start, r->ar.end, MPOL_MF_MOVE_ALL); + + mmput(mm); + + return applied; +} + +#else +static inline unsigned long damon_migrate_pages(struct damon_target *t, + struct damon_region *r, nodemask_t task_remote_nodes) +{ + return 0; +} +#endif /* CONFIG_MIGRATION */ + /* * Functions for the access checking of the regions */ @@ -420,6 +533,11 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) struct mm_struct *mm; struct damon_region *r; + if (damon_use_mem_sampling()) { + hw_damon_va_prepare_access_checks(ctx); + return; + } + damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) @@ -589,6 +707,11 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) unsigned int max_nr_accesses = 0; bool same_target; + if (damon_use_mem_sampling()) { + max_nr_accesses = hw_damon_va_check_accesses(ctx); + return max_nr_accesses; + } + damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) @@ -670,6 +793,8 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_DEMOTION: + return damon_migrate_pages(t, r, scheme->remote_node); case DAMOS_STAT: return 0; default: @@ -690,6 +815,8 @@ static int damon_va_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_cold_score(context, r, scheme); + case DAMOS_DEMOTION: + return damon_cold_score(context, r, scheme); default: break; } diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c new file mode 100644 index 0000000000000000000000000000000000000000..126cf71a9fb25dd2a62b5fe265a6fe10c6aed533 --- /dev/null +++ b/mm/mem_sampling.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mem_sampling.c: declare the mem_sampling abstract layer and provide + * unified pmu sampling for NUMA, DAMON, etc. + * + * Sample records are converted to mem_sampling_record, and then + * mem_sampling_record_captured_cb_type invoke the callbacks to + * pass the record. + * + * Copyright (c) 2024-2025, Huawei Technologies Ltd. 
+ */ + +#define pr_fmt(fmt) "mem_sampling: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEM_SAMPLING_DISABLED 0x0 +#define MEM_SAMPLING_NORMAL 0x1 +#define MEM_SAMPLING_MIN_VALUE 0 +#define MEM_SAMPLING_MAX_VALUE 5 + +struct mem_sampling_ops_struct mem_sampling_ops; +static int mem_sampling_override __initdata; +static int sysctl_mem_sampling_mode; + +static const int mem_sampling_min_value = MEM_SAMPLING_MIN_VALUE; +static const int mem_sampling_max_value = MEM_SAMPLING_MAX_VALUE; + +/* keep track of who uses the SPE */ +DEFINE_PER_CPU(enum arm_spe_user_e, arm_spe_user); +EXPORT_PER_CPU_SYMBOL_GPL(arm_spe_user); + +enum mem_sampling_saved_state_e { + MEM_SAMPLING_STATE_ENABLE, + MEM_SAMPLING_STATE_DISABLE, + MEM_SAMPLING_STATE_EMPTY, +}; +enum mem_sampling_saved_state_e mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY; + +/* + * Callbacks should be registered using mem_sampling_record_cb_register() + * by NUMA, DAMON, etc. during their initialisation. + * Callbacks will be invoked when new hardware PMU records are captured. + */ +typedef void (*mem_sampling_record_cb_type)(struct mem_sampling_record *record); + +struct mem_sampling_record_cb_list_entry { + struct list_head list; + mem_sampling_record_cb_type cb; +}; +LIST_HEAD(mem_sampling_record_cb_list); + +struct mem_sampling_numa_access_work { + struct callback_head work; + u64 vaddr, paddr; + int cpu; +}; + +void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb) +{ + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + if (cb_entry->cb == cb) + return; + } + + cb_entry = kmalloc(sizeof(struct mem_sampling_record_cb_list_entry), GFP_KERNEL); + if (!cb_entry) + return; + + cb_entry->cb = cb; + list_add(&(cb_entry->list), &mem_sampling_record_cb_list); +} + +void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb) +{ + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + if (cb_entry->cb == cb) { + list_del(&cb_entry->list); + kfree(cb_entry); + return; + } + } +} + +DEFINE_STATIC_KEY_FALSE(mem_sampling_access_hints); +void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr) +{ + if (!static_branch_unlikely(&mem_sampling_access_hints)) + return; + + if (!mem_sampling_ops.sampling_start) + return; + + if (curr->mm) + mem_sampling_ops.sampling_start(); + else + mem_sampling_ops.sampling_stop(); +} + +DEFINE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling); +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +static int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr, int page_nid, int *flags) +{ + folio_get(folio); + + /* Record the current PID accessing the VMA */ + vma_set_access_pid_bit(vma); + + count_vm_numa_event(NUMA_HINT_FAULTS); + if (page_nid == numa_node_id()) { + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + *flags |= TNF_FAULT_LOCAL; + } + + return mpol_misplaced(folio, vma, addr); +} + +/* + * Called from task_work context to act upon the page access. + * + * Physical address (provided by SPE) is used directly instead + * of walking the page tables to get to the PTE/page. Hence we + * don't check if PTE is writable for the TNF_NO_GROUP + * optimization, which means RO pages are considered for grouping. 
+ */ +static void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr) +{ + struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + struct page *page = NULL; + struct folio *folio; + int page_nid = NUMA_NO_NODE; + int last_cpupid; + int target_nid; + int flags = 0; + + if (!mm) + return; + + if (!mmap_read_trylock(mm)) + return; + + vma = find_vma(mm, laddr); + if (!vma) + goto out_unlock; + + if (!vma_migratable(vma) || !vma_policy_mof(vma) || + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) + goto out_unlock; + + if (!vma->vm_mm || + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + goto out_unlock; + + if (!vma_is_accessible(vma)) + goto out_unlock; + + /* pfn_to_online_page() may return NULL for offline or invalid PFNs */ + page = pfn_to_online_page(PHYS_PFN(paddr)); + if (!page) + goto out_unlock; + + folio = page_folio(page); + + if (!folio || folio_is_zone_device(folio)) + goto out_unlock; + + if (unlikely(!PageLRU(page))) + goto out_unlock; + + /* TODO: handle PTE-mapped THP or PMD-mapped THP */ + if (folio_test_large(folio)) + goto out_unlock; + + /* + * Flag if the page is shared between multiple address spaces. This + * is later used when determining whether to group tasks together + */ + if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) + flags |= TNF_SHARED; + + page_nid = folio_nid(folio); + + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if (folio_use_access_time(folio)) + last_cpupid = (-1 & LAST_CPUPID_MASK); + else + last_cpupid = folio_last_cpupid(folio); + target_nid = numa_migrate_prep(folio, vma, laddr, page_nid, &flags); + if (target_nid == NUMA_NO_NODE) { + folio_put(folio); + goto out; + } + + /* Migrate to the requested node */ + if (migrate_misplaced_folio(folio, vma, target_nid)) { + page_nid = target_nid; + flags |= TNF_MIGRATED; + } else { + flags |= TNF_MIGRATE_FAIL; + } + +out: + trace_mm_numa_migrating(laddr, page_nid, target_nid, flags & TNF_MIGRATED); + if (page_nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, page_nid, 1, flags); + +out_unlock: + mmap_read_unlock(mm); +} + +static void task_mem_sampling_access_work(struct callback_head *work) +{ + struct mem_sampling_numa_access_work *iwork = + container_of(work, struct mem_sampling_numa_access_work, work); + + if (iwork->cpu == smp_processor_id()) + do_numa_access(current, iwork->vaddr, iwork->paddr); + kfree(iwork); +} + +static void numa_create_taskwork(u64 vaddr, u64 paddr, int cpu) +{ + struct mem_sampling_numa_access_work *iwork = NULL; + + iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC); + if (!iwork) + return; + + iwork->vaddr = vaddr; + iwork->paddr = paddr; + iwork->cpu = cpu; + + init_task_work(&iwork->work, task_mem_sampling_access_work); + task_work_add(current, &iwork->work, TWA_RESUME); +} + +static void numa_balancing_mem_sampling_cb(struct mem_sampling_record *record) +{ + struct task_struct *p = current; + u64 vaddr = record->virt_addr; + u64 paddr = record->phys_addr; + + /* Discard kernel address accesses */ + if (vaddr & (1UL << 63)) + return; + + if (p->pid != record->context_id) + return; + + trace_mm_mem_sampling_access_record(vaddr, paddr, smp_processor_id(), + current->pid); + numa_create_taskwork(vaddr, paddr, smp_processor_id()); +} + +static void numa_balancing_mem_sampling_cb_register(void) +{ + mem_sampling_record_cb_register(numa_balancing_mem_sampling_cb); +} + +static void numa_balancing_mem_sampling_cb_unregister(void) +{ + mem_sampling_record_cb_unregister(numa_balancing_mem_sampling_cb); +} +static void 
set_numabalancing_mem_sampling_state(bool enabled) +{ + if (enabled) { + numa_balancing_mem_sampling_cb_register(); + static_branch_enable(&sched_numabalancing_mem_sampling); + } else { + numa_balancing_mem_sampling_cb_unregister(); + static_branch_disable(&sched_numabalancing_mem_sampling); + } +} +#else +static inline void set_numabalancing_mem_sampling_state(bool enabled) { } +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ + +DEFINE_STATIC_KEY_FALSE(mm_damon_mem_sampling); +#ifdef CONFIG_DAMON_MEM_SAMPLING +static void damon_mem_sampling_record_cb(struct mem_sampling_record *record) +{ + struct damon_mem_sampling_fifo *damon_fifo; + struct damon_mem_sampling_record domon_record; + struct task_struct *task = NULL; + struct mm_struct *mm; + + /* Discard kernel address accesses */ + if (record->virt_addr & (1UL << 63)) + return; + + task = find_get_task_by_vpid((pid_t)record->context_id); + if (!task) + return; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return; + + damon_fifo = mm->damon_fifo; + mmput(mm); + + domon_record.vaddr = record->virt_addr; + + /* only the proc under monitor now has damon_fifo */ + if (damon_fifo) { + if (kfifo_is_full(&damon_fifo->rx_kfifo)) + return; + + kfifo_in_locked(&damon_fifo->rx_kfifo, &domon_record, + sizeof(struct damon_mem_sampling_record), + &damon_fifo->rx_kfifo_lock); + return; + } +} + +static void damon_mem_sampling_record_cb_register(void) +{ + mem_sampling_record_cb_register(damon_mem_sampling_record_cb); +} + +static void damon_mem_sampling_record_cb_unregister(void) +{ + mem_sampling_record_cb_unregister(damon_mem_sampling_record_cb); +} + +static void set_damon_mem_sampling_state(bool enabled) +{ + if (enabled) { + damon_mem_sampling_record_cb_register(); + static_branch_enable(&mm_damon_mem_sampling); + } else { + damon_mem_sampling_record_cb_unregister(); + static_branch_disable(&mm_damon_mem_sampling); + } +} + +bool damon_use_mem_sampling(void) +{ + return static_branch_unlikely(&mem_sampling_access_hints) && + static_branch_unlikely(&mm_damon_mem_sampling); +} +#else +static inline void set_damon_mem_sampling_state(bool enabled) { } +#endif + +void mem_sampling_process(void) +{ + int i, nr_records; + struct mem_sampling_record *record; + struct mem_sampling_record *record_base; + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + mem_sampling_ops.sampling_decoding(); + + record_base = (struct mem_sampling_record *)mem_sampling_ops.mm_spe_getbuf_addr(); + nr_records = mem_sampling_ops.mm_spe_getnum_record(); + + if (list_empty(&mem_sampling_record_cb_list)) + goto out; + + for (i = 0; i < nr_records; i++) { + record = record_base + i; + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + cb_entry->cb(record); + } + } +out: + /* if mem_sampling_access_hints is set to false, stop sampling */ + if (static_branch_unlikely(&mem_sampling_access_hints)) + mem_sampling_ops.sampling_continue(); + else + mem_sampling_ops.sampling_stop(); +} +EXPORT_SYMBOL_GPL(mem_sampling_process); + +static inline enum mem_sampling_type_enum mem_sampling_get_type(void) +{ +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING + return MEM_SAMPLING_ARM_SPE; +#else + return MEM_SAMPLING_UNSUPPORTED; +#endif +} + +static void __set_mem_sampling_state(bool enabled) +{ + if (enabled) + static_branch_enable(&mem_sampling_access_hints); + else { + static_branch_disable(&mem_sampling_access_hints); + set_numabalancing_mem_sampling_state(enabled); + set_damon_mem_sampling_state(enabled); + } +} + +void set_mem_sampling_state(bool 
enabled) +{ + if (mem_sampling_saved_state != MEM_SAMPLING_STATE_EMPTY) { + mem_sampling_saved_state = enabled ? MEM_SAMPLING_STATE_ENABLE : + MEM_SAMPLING_STATE_DISABLE; + return; + } + + if (!mem_sampling_ops.sampling_start || !mm_spe_enabled()) + return; + if (enabled) + sysctl_mem_sampling_mode = MEM_SAMPLING_NORMAL; + else + sysctl_mem_sampling_mode = MEM_SAMPLING_DISABLED; + __set_mem_sampling_state(enabled); +} + +void mem_sampling_user_switch_process(enum user_switch_type type) +{ + bool state; + int mm_spe_perf_user_count = 0; + int cpu; + + if (type > USER_SWITCH_BACK_TO_MEM_SAMPLING) { + pr_err("user switch type error.\n"); + return; + } + + for_each_possible_cpu(cpu) { + if (per_cpu(arm_spe_user, cpu) == SPE_USER_PERF) + mm_spe_perf_user_count++; + } + + if (type == USER_SWITCH_AWAY_FROM_MEM_SAMPLING) { + /* only save the state the first time we leave mem_sampling */ + if (mem_sampling_saved_state != MEM_SAMPLING_STATE_EMPTY) + return; + + if (static_branch_unlikely(&mem_sampling_access_hints)) + mem_sampling_saved_state = MEM_SAMPLING_STATE_ENABLE; + else + mem_sampling_saved_state = MEM_SAMPLING_STATE_DISABLE; + + pr_debug("user switch away from mem_sampling, %s is saved, set to disable.\n", + mem_sampling_saved_state ? "disabled" : "enabled"); + + set_mem_sampling_state(false); + } else { + /* If the state is not backed up, do not restore it */ + if (mem_sampling_saved_state == MEM_SAMPLING_STATE_EMPTY || mm_spe_perf_user_count) + return; + + state = (mem_sampling_saved_state == MEM_SAMPLING_STATE_ENABLE) ? true : false; + set_mem_sampling_state(state); + mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY; + + pr_debug("user switch back to mem_sampling, set to saved %s.\n", + state ? "enable" : "disable"); + } +} +EXPORT_SYMBOL_GPL(mem_sampling_user_switch_process); + +#ifdef CONFIG_PROC_SYSCTL +static int proc_mem_sampling_enable(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = 0; + + if (static_branch_likely(&mem_sampling_access_hints)) + state = 1; + if (static_branch_likely(&sched_numabalancing_mem_sampling)) + state = 2; + if (static_branch_likely(&mm_damon_mem_sampling)) + state = 3; + if (static_branch_likely(&mm_damon_mem_sampling) && + static_branch_likely(&sched_numabalancing_mem_sampling)) + state = 4; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + t.extra1 = (int *)&mem_sampling_min_value; + t.extra2 = (int *)&mem_sampling_max_value; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) { + switch (state) { + case 0: + set_mem_sampling_state(false); + break; + case 1: + set_mem_sampling_state(false); + set_mem_sampling_state(true); + break; + case 2: + set_mem_sampling_state(false); + set_mem_sampling_state(true); + set_numabalancing_mem_sampling_state(true); + break; + case 3: + set_mem_sampling_state(false); + set_mem_sampling_state(true); + set_damon_mem_sampling_state(true); + break; + case 4: + set_mem_sampling_state(true); + set_numabalancing_mem_sampling_state(true); + set_damon_mem_sampling_state(true); + break; + default: + return -EINVAL; + } + } + return err; +} + +static struct ctl_table mem_sampling_sysctls[] = { + { + .procname = "mem_sampling_enable", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_mem_sampling_enable, + .extra1 = SYSCTL_ZERO, + .extra2 = (int *)&mem_sampling_max_value, + }, 
+ {} +}; + +static void __init mem_sampling_sysctl_init(void) +{ + register_sysctl_init("kernel", mem_sampling_sysctls); +} +#else +#define mem_sampling_sysctl_init() do { } while (0) +#endif + +static void __init check_mem_sampling_enable(void) +{ + bool mem_sampling_default = false; + + /* Parsed by setup_mem_sampling_enable(); override == 1 enables */ + if (mem_sampling_override) + set_mem_sampling_state(mem_sampling_override == 1); + else + set_mem_sampling_state(mem_sampling_default); +} + +static int __init setup_mem_sampling_enable(char *str) +{ + int ret = 0; + + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + mem_sampling_override = 1; + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse mem_sampling=\n"); + + return ret; +} +__setup("mem_sampling=", setup_mem_sampling_enable); + +static int __init mem_sampling_init(void) +{ + enum mem_sampling_type_enum mem_sampling_type = mem_sampling_get_type(); + int cpu; + + switch (mem_sampling_type) { + case MEM_SAMPLING_ARM_SPE: + mem_sampling_ops.sampling_start = mm_spe_start; + mem_sampling_ops.sampling_stop = mm_spe_stop; + mem_sampling_ops.sampling_continue = mm_spe_continue; + mem_sampling_ops.sampling_decoding = mm_spe_decoding; + mem_sampling_ops.mm_spe_getbuf_addr = mm_spe_getbuf_addr; + mem_sampling_ops.mm_spe_getnum_record = mm_spe_getnum_record; + + break; + + default: + pr_info("unsupported hardware pmu type (%d), disabling access hints!\n", + mem_sampling_type); + set_mem_sampling_state(false); + return -ENODEV; + } + check_mem_sampling_enable(); + mem_sampling_sysctl_init(); + + for_each_possible_cpu(cpu) + per_cpu(arm_spe_user, cpu) = SPE_USER_MEM_SAMPLING; + + return 0; +} +late_initcall(mem_sampling_init); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 219c098b3ffa24287b6fdd2c7b70201fe4b36b07..88f0bb008efd78e619ebe43981b8f6830bfe3913 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1104,6 +1104,46 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return err; } +/* + * Migrate pages in an area from one node to a target node. + * Returns error or the number of pages not migrated. + */ +static int migrate_area_to_node(struct mm_struct *mm, int source, int dest, + unsigned long start, unsigned long end, int flags) +{ + nodemask_t nmask; + struct vm_area_struct *vma; + LIST_HEAD(pagelist); + int err = 0; + struct migration_target_control mtc = { + .nid = dest, + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + }; + + nodes_clear(nmask); + node_set(source, nmask); + + /* + * This does not "check" the range but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + */ + vma = find_vma(mm, 0); + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + queue_pages_range(mm, start, end, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist, false); + + if (!list_empty(&pagelist)) { + err = migrate_pages(&pagelist, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_DAMON_DEMOTION, NULL); + if (err) + putback_movable_pages(&pagelist); + } + + return err; +} + + /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. @@ -1209,6 +1249,112 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, } +/* + * Move the pages in the given mm area between the two nodesets so as to + * preserve the physical layout as much as possible. + * + * Returns the number of pages that could not be moved. 
+ */ +int do_migrate_area_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, unsigned long start, + unsigned long end, int flags) +{ + int busy = 0; + int err = 0; + nodemask_t tmp; + + lru_cache_disable(); + + mmap_read_lock(mm); + + /* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scanning from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ + + tmp = *from; + while (!nodes_empty(tmp)) { + int s, d; + int source = NUMA_NO_NODE; + int dest = 0; + + for_each_node_mask(s, tmp) { + + /* + * do_migrate_pages() tries to maintain the relative + * node relationship of the pages established between + * threads and memory areas. + * + * However if the number of source nodes is not equal to + * the number of destination nodes we can not preserve + * this node relative relationship. In that case, skip + * copying memory from a node that is in the destination + * mask. + * + * Example: [2,3,4] -> [3,4,5] moves everything. + * [0-7] - > [3,4,5] moves only 0,1,2,6,7. + */ + + if ((nodes_weight(*from) != nodes_weight(*to)) && + (node_isset(s, *to))) + continue; + + d = node_remap(s, *from, *to); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + + /* dest not in remaining from nodes? */ + if (!node_isset(dest, tmp)) + break; + } + if (source == NUMA_NO_NODE) + break; + + node_clear(source, tmp); + err = migrate_area_to_node(mm, source, dest, start, end, flags); + if (err > 0) + busy += err; + if (err < 0) + break; + } + mmap_read_unlock(mm); + + lru_cache_enable(); + if (err < 0) + return err; + return busy; + +} + /* * Allocate a new page for page migration based on vma policy. * Start by assuming the page is mapped by the same vma as contains @start.
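The source/dest pair selection in do_migrate_area_pages() (shared with do_migrate_pages()) is easiest to follow with a concrete example. The sketch below is a stand-alone userspace illustration, not kernel code: nodes are modeled as bits of an unsigned long, and node_remap() is approximated for the equal-weight case by mapping the i-th set bit of 'from' to the i-th set bit of 'to'.

/*
 * Userspace illustration of the node pair selection used above.
 * Sketch for the equal-weight case only; not part of the patch.
 */
#include <stdio.h>

static int remap(int s, unsigned long from, unsigned long to)
{
	int pos = 0, n;

	/* ordinal position of node s within 'from' */
	for (n = 0; n < s; n++)
		if (from & (1UL << n))
			pos++;

	/* the node of 'to' at the same ordinal position */
	for (n = 0; n < 64; n++) {
		if (!(to & (1UL << n)))
			continue;
		if (pos-- == 0)
			return n;
	}
	return s;
}

int main(void)
{
	unsigned long from = 0x1cUL;	/* nodes {2,3,4} */
	unsigned long to   = 0x38UL;	/* nodes {3,4,5} */
	int s;

	for (s = 0; s < 8; s++)
		if (from & (1UL << s))
			printf("node %d -> node %d\n", s, remap(s, from, to));
	return 0;	/* prints 2 -> 3, 3 -> 4, 4 -> 5 */
}

When the weights of 'from' and 'to' differ, the kernel loop additionally skips any source node that is already present in the destination mask, which is why the comment's [0-7] -> [3,4,5] example moves only nodes 0, 1, 2, 6 and 7.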