diff --git a/include/linux/damon.h b/include/linux/damon.h
index 343132a146cf04b546a07eb6c653608d55aac9c1..b99add6c4f6c79ef280e3b1b526f6c8402a9815b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -13,11 +13,13 @@
 #include
 #include
 #include
+#include

 /* Minimal region size. Every damon_region is aligned by this. */
 #define DAMON_MIN_REGION	PAGE_SIZE
 /* Max priority score for DAMON-based operation schemes */
 #define DAMOS_MAX_SCORE		(99)
+DECLARE_STATIC_KEY_FALSE(numa_stat_enabled_key);

 /* Get a random number in [l, r) */
 static inline unsigned long damon_rand(unsigned long l, unsigned long r)
@@ -25,6 +27,9 @@ static inline unsigned long damon_rand(unsigned long l, unsigned long r)
 	return l + get_random_u32_below(r - l);
 }

+extern struct damon_ctx **dbgfs_ctxs;
+extern int dbgfs_nr_ctxs;
+
 /**
  * struct damon_addr_range - Represents an address region of [@start, @end).
  * @start:	Start address of the region (inclusive).
  * @end:	End address of the region (exclusive).
@@ -42,6 +47,8 @@ struct damon_addr_range {
 * @nr_accesses:	Access frequency of this region.
 * @list:		List head for siblings.
 * @age:		Age of this region.
+ * @local:		Number of accesses observed from the local NUMA node.
+ * @remote:		Number of accesses observed from remote NUMA nodes.
 *
 * @age is initially zero, increased for each aggregation interval, and reset
 * to zero again if the access frequency is significantly changed. If two
@@ -57,6 +64,8 @@ struct damon_region {
 	unsigned int age;
 /* private: Internal value for age calculation. */
 	unsigned int last_nr_accesses;
+	unsigned long local;
+	unsigned long remote;
 };

 /**
@@ -65,6 +74,7 @@ struct damon_region {
 * @nr_regions:		Number of monitoring target regions of this target.
 * @regions_list:	Head of the monitoring target regions of this target.
 * @list:		List head for siblings.
+ * @target_lock:	Protects @regions_list against concurrent access.
 *
 * Each monitoring context could have multiple targets. For example, a context
 * for virtual memory address spaces could have multiple target processes. The
@@ -74,8 +84,11 @@ struct damon_region {
 struct damon_target {
 	struct pid *pid;
 	unsigned int nr_regions;
+	unsigned int nr_init_regions;
+	struct damon_addr_range *init_regions;
 	struct list_head regions_list;
 	struct list_head list;
+	spinlock_t target_lock;
 };

 /**
@@ -549,6 +562,7 @@ struct damon_ctx {
 	struct completion kdamond_started;

 /* public: */
+	int need_flush;
 	struct task_struct *kdamond;
 	struct mutex kdamond_lock;

@@ -683,4 +697,18 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,

 #endif	/* CONFIG_DAMON */

+#ifdef CONFIG_DAMON_VADDR
+/*
+ * 't->pid' should be the pointer to the relevant 'struct pid' having reference
+ * count. Caller must put the returned task, unless it is NULL.
+ */
+#define damon_get_task_struct(t) \
+(get_pid_task((struct pid *)t->pid, PIDTYPE_PID))
+
+void damon_numa_fault(int page_nid, int node_id, struct vm_fault *vmf);
+#else
+static inline void damon_numa_fault(int page_nid, int node_id, struct vm_fault *vmf) { }
+
+#endif /* CONFIG_DAMON_VADDR */
+
 #endif	/* _DAMON_H */
diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index c79f1d4c39afe754a18d3c646a0bd262e531eb83..c976a093a4e4698afc37995913efe77960a031a7 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -23,6 +23,8 @@ TRACE_EVENT(damon_aggregated,
 		__field(unsigned long, end)
 		__field(unsigned int, nr_accesses)
 		__field(unsigned int, age)
+		__field(unsigned long, local)
+		__field(unsigned long, remote)
 	),

 	TP_fast_assign(
@@ -32,12 +34,15 @@ TRACE_EVENT(damon_aggregated,
 		__entry->end = r->ar.end;
 		__entry->nr_accesses = r->nr_accesses;
 		__entry->age = r->age;
+		__entry->local = r->local;
+		__entry->remote = r->remote;
 	),

-	TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u",
+	TP_printk("target_id=%lu nr_regions=%u %lx-%lx: %u %u %lu %lu",
 			__entry->target_id, __entry->nr_regions,
 			__entry->start, __entry->end,
-			__entry->nr_accesses, __entry->age)
+			__entry->nr_accesses, __entry->age,
+			__entry->local, __entry->remote)
 );

 #endif /* _TRACE_DAMON_H */
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 43e4fe7ef17eb409c76f74a5d85753e9fd8a1774..af9bbe0e22fac2c5fbd75b0fb59f86b6f7eb226f 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -121,7 +121,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
 {
 	struct damon_region *region;

-	region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL);
+	region = kmem_cache_alloc(damon_region_cache, GFP_ATOMIC);
 	if (!region)
 		return NULL;

@@ -132,6 +132,8 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
 	region->age = 0;
 	region->last_nr_accesses = 0;
+	region->local = 0;
+	region->remote = 0;

 	return region;
 }

@@ -395,8 +397,11 @@ struct damon_target *damon_new_target(void)
 	t->pid = NULL;
 	t->nr_regions = 0;
+	t->nr_init_regions = 0;
+	t->init_regions = NULL;
 	INIT_LIST_HEAD(&t->regions_list);
 	INIT_LIST_HEAD(&t->list);
+	spin_lock_init(&t->target_lock);

 	return t;
 }

@@ -420,8 +425,11 @@ void damon_free_target(struct damon_target *t)
 {
 	struct damon_region *r, *next;

+	spin_lock(&t->target_lock);
 	damon_for_each_region_safe(r, next, t)
 		damon_free_region(r);
+	kfree(t->init_regions);
+	spin_unlock(&t->target_lock);
 	kfree(t);
 }

@@ -1127,6 +1135,8 @@ static void damon_merge_two_regions(struct damon_target *t,

 	l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
 			(sz_l + sz_r);
+	l->remote = (l->remote * sz_l + r->remote * sz_r) / (sz_l + sz_r);
+	l->local = (l->local * sz_l + r->local * sz_r) / (sz_l + sz_r);
 	l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
 	l->ar.end = r->ar.end;
 	damon_destroy_region(r, t);
@@ -1188,8 +1198,10 @@ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
 	do {
 		nr_regions = 0;
 		damon_for_each_target(t, c) {
+			spin_lock(&t->target_lock);
 			damon_merge_regions_of(t, threshold, sz_limit);
 			nr_regions += damon_nr_regions(t);
+			spin_unlock(&t->target_lock);
 		}
 		threshold = max(1, threshold * 2);
 	} while (nr_regions > c->attrs.max_nr_regions &&
@@ -1216,6 +1228,8 @@ static void damon_split_region_at(struct damon_target *t,
 	new->age = r->age;
 	new->last_nr_accesses = r->last_nr_accesses;
 	new->nr_accesses = r->nr_accesses;
+	new->local = r->local;
+	new->remote = r->remote;

 	damon_insert_region(new, r, damon_next_region(r), t);
 }

@@ -1276,8 +1290,11 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
 			nr_regions < ctx->attrs.max_nr_regions / 3)
 		nr_subregions = 3;

-	damon_for_each_target(t, ctx)
+	damon_for_each_target(t, ctx) {
+		spin_lock(&t->target_lock);
 		damon_split_regions_of(t, nr_subregions);
+		spin_unlock(&t->target_lock);
+	}

 	last_nr_regions = nr_regions;
 }
@@ -1501,8 +1518,10 @@ static int kdamond_fn(void *data)
 	}
 done:
 	damon_for_each_target(t, ctx) {
+		spin_lock(&t->target_lock);
 		damon_for_each_region_safe(r, next, t)
 			damon_destroy_region(r, t);
+		spin_unlock(&t->target_lock);
 	}

 	if (ctx->callback.before_terminate)
@@ -1521,6 +1540,13 @@ static int kdamond_fn(void *data)
 		running_exclusive_ctxs = false;
 	mutex_unlock(&damon_lock);

+	/*
+	 * When no kdamond threads are running, keep
+	 * 'numa_stat_enabled_key' at its default (disabled) value.
+	 */
+	if (!nr_running_ctxs)
+		static_branch_disable(&numa_stat_enabled_key);
+
 	return 0;
 }

@@ -1597,6 +1623,73 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
 	return damon_set_regions(t, &addr_range, 1);
 }

+static struct damon_target *get_damon_target(struct task_struct *task)
+{
+	int i;
+	struct damon_target *t;
+
+	rcu_read_lock();
+	for (i = 0; i < READ_ONCE(dbgfs_nr_ctxs); i++) {
+		struct damon_ctx *ctx = rcu_dereference(dbgfs_ctxs[i]);
+
+		if (!ctx || !ctx->kdamond)
+			continue;
+		damon_for_each_target(t, dbgfs_ctxs[i]) {
+			struct task_struct *ts = damon_get_task_struct(t);
+
+			if (!ts)
+				continue;
+
+			if (ts->mm == task->mm) {
+				put_task_struct(ts);
+				rcu_read_unlock();
+				return t;
+			}
+			put_task_struct(ts);
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static struct damon_region *get_damon_region(struct damon_target *t, unsigned long addr)
+{
+	struct damon_region *r, *next;
+
+	if (!t || !addr)
+		return NULL;
+
+	damon_for_each_region_safe(r, next, t) {
+		if (r->ar.start <= addr && r->ar.end >= addr)
+			return r;
+	}
+
+	return NULL;
+}
+
+void damon_numa_fault(int page_nid, int node_id, struct vm_fault *vmf)
+{
+	struct damon_target *t;
+	struct damon_region *r;
+
+	if (static_branch_unlikely(&numa_stat_enabled_key)
+		&& nr_online_nodes > 1) {
+		t = get_damon_target(current);
+		if (t) {
+			spin_lock(&t->target_lock);
+			r = get_damon_region(t, vmf->address);
+			if (r) {
+				if (page_nid == node_id)
+					r->local++;
+				else
+					r->remote++;
+			}
+			spin_unlock(&t->target_lock);
+		}
+	}
+}
+
 static int __init damon_init(void)
 {
 	damon_region_cache = KMEM_CACHE(damon_region, 0);
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index dc0ea1fc30ca5ff0b4377d91735f45be0f20ff8d..3e4a52efdf97cbcce8f82addc2848d35df763652 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -15,8 +15,8 @@
 #include
 #include

-static struct damon_ctx **dbgfs_ctxs;
-static int dbgfs_nr_ctxs;
+struct damon_ctx **dbgfs_ctxs;
+int dbgfs_nr_ctxs;
 static struct dentry **dbgfs_dirs;
 static DEFINE_MUTEX(damon_dbgfs_lock);

@@ -640,6 +640,9 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
 	damon_for_each_target(t, c) {
 		damon_for_each_region_safe(r, next, t)
 			damon_destroy_region(r, t);
+		kfree(t->init_regions);
+		t->init_regions = NULL;
+		t->nr_init_regions = 0;
 	}

 	while (pos < len) {
@@ -653,12 +656,37 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
 		pos += parsed;
 	}

+	/* Set damon_target->init_regions */
+	damon_for_each_target(t, c) {
+		unsigned int nr_regions = t->nr_regions;
+		int idx = 0;
+
+		t->nr_init_regions = nr_regions;
+		t->init_regions = kmalloc_array(nr_regions, sizeof(struct damon_addr_range),
+			GFP_KERNEL);
+		if (t->init_regions == NULL)
+			goto fail;
+		damon_for_each_region_safe(r, next, t) {
+			/* TODO: can this ever happen? */
+			if (idx == nr_regions) {
+				pr_alert("nr_regions overflow, init_regions already full.\n");
+				break;
+			}
+			t->init_regions[idx].start = r->ar.start;
+			t->init_regions[idx].end = r->ar.end;
+			idx++;
+		}
+	}
+
 	return 0;

 fail:
 	damon_for_each_target(t, c) {
 		damon_for_each_region_safe(r, next, t)
 			damon_destroy_region(r, t);
+		kfree(t->init_regions);
+		t->init_regions = NULL;
+		t->nr_init_regions = 0;
 	}
 	return err;
 }
@@ -718,6 +746,49 @@ static ssize_t dbgfs_kdamond_pid_read(struct file *file,
 	return len;
 }

+DEFINE_STATIC_KEY_FALSE(numa_stat_enabled_key);
+
+static ssize_t dbgfs_numa_stat_read(struct file *file,
+		char __user *buf, size_t count, loff_t *ppos)
+{
+	char numa_on_buf[5];
+	bool enable = static_branch_unlikely(&numa_stat_enabled_key);
+	int len;
+
+	len = scnprintf(numa_on_buf, 5, enable ? "on\n" : "off\n");
+
+	return simple_read_from_buffer(buf, count, ppos, numa_on_buf, len);
+}
+
+static ssize_t dbgfs_numa_stat_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	ssize_t ret = 0;
+	char *kbuf;
+
+	kbuf = user_input_str(buf, count, ppos);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	/* Remove white space */
+	if (sscanf(kbuf, "%s", kbuf) != 1) {
+		kfree(kbuf);
+		return -EINVAL;
+	}
+
+	if (!strncmp(kbuf, "on", count))
+		static_branch_enable(&numa_stat_enabled_key);
+	else if (!strncmp(kbuf, "off", count))
+		static_branch_disable(&numa_stat_enabled_key);
+	else
+		ret = -EINVAL;
+
+	if (!ret)
+		ret = count;
+	kfree(kbuf);
+	return ret;
+}
+
 static int damon_dbgfs_open(struct inode *inode, struct file *file)
 {
 	damon_dbgfs_warn_deprecation();
@@ -756,12 +827,17 @@ static const struct file_operations kdamond_pid_fops = {
 	.read = dbgfs_kdamond_pid_read,
 };

+static const struct file_operations numa_stat_ops = {
+	.write = dbgfs_numa_stat_write,
+	.read = dbgfs_numa_stat_read,
+};
+
 static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
 {
 	const char * const file_names[] = {"attrs", "schemes", "target_ids",
-		"init_regions", "kdamond_pid"};
+		"init_regions", "kdamond_pid", "numa_stat"};
 	const struct file_operations *fops[] = {&attrs_fops, &schemes_fops,
-		&target_ids_fops, &init_regions_fops, &kdamond_pid_fops};
+		&target_ids_fops, &init_regions_fops, &kdamond_pid_fops, &numa_stat_ops};
 	int i;

 	for (i = 0; i < ARRAY_SIZE(file_names); i++)
@@ -937,10 +1013,18 @@ static int dbgfs_rm_context(char *name)
 		goto out_new_dirs;
 	}

-	for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
+	dbgfs_nr_ctxs--;
+	/* Prevent the NUMA fault path from using a stale context count */
+	smp_mb();
+
+	for (i = 0, j = 0; i < dbgfs_nr_ctxs + 1; i++) {
 		if (dbgfs_dirs[i] == dir) {
+			struct damon_ctx *tmp_ctx = dbgfs_ctxs[i];
+
+			rcu_assign_pointer(dbgfs_ctxs[i], NULL);
+			synchronize_rcu();
 			debugfs_remove(dbgfs_dirs[i]);
-			dbgfs_destroy_ctx(dbgfs_ctxs[i]);
+			dbgfs_destroy_ctx(tmp_ctx);
 			continue;
 		}
 		new_dirs[j] = dbgfs_dirs[i];
@@ -952,7 +1036,6 @@ static int dbgfs_rm_context(char *name)

 	dbgfs_dirs = new_dirs;
 	dbgfs_ctxs = new_ctxs;
-	dbgfs_nr_ctxs--;

 	goto out_dput;

diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 5764b9885e7d215cbae84bc64c2b79508c2d260d..889259f40e57d9f8caeb7d28dee1253288f8c475 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include

 #include "ops-common.h"

@@ -22,14 +23,6 @@
 #define DAMON_MIN_REGION 1
 #endif

-/*
- * 't->pid' should be the pointer to the relevant 'struct pid' having reference
- * count. Caller must put the returned task, unless it is NULL.
- */
-static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
-{
-	return get_pid_task(t->pid, PIDTYPE_PID);
-}

 /*
  * Get the mm_struct of the given target
@@ -259,10 +252,12 @@ static void __damon_va_init_regions(struct damon_ctx *ctx,
 		sz = DAMON_MIN_REGION;

 	/* Set the initial three regions of the target */
+	spin_lock(&t->target_lock);
 	for (i = 0; i < 3; i++) {
 		r = damon_new_region(regions[i].start, regions[i].end);
 		if (!r) {
 			pr_err("%d'th init region creation failed\n", i);
+			spin_unlock(&t->target_lock);
 			return;
 		}
 		damon_add_region(r, t);
@@ -270,6 +265,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx,
 		nr_pieces = (regions[i].end - regions[i].start) / sz;
 		damon_va_evenly_split_region(t, r, nr_pieces);
 	}
+	spin_unlock(&t->target_lock);
 }

 /* Initialize '->regions_list' of every target (task) */
@@ -284,6 +280,45 @@ static void damon_va_init(struct damon_ctx *ctx)
 	}
 }

+static void damon_va_apply_init_regions(struct damon_target *t)
+{
+	struct damon_region *r, *next, *prev;
+	unsigned int i = 0;
+
+	/* Remove all regions */
+	damon_for_each_region_safe(r, next, t) {
+		damon_destroy_region(r, t);
+	}
+
+	for (i = 0; i < t->nr_init_regions; i++) {
+		struct damon_addr_range ar = t->init_regions[i];
+
+		r = damon_new_region(ar.start, ar.end);
+		if (!r) {
+			pr_err("allocating memory failed for new region: 0x%lx - 0x%lx\n",
+				ar.start, ar.end);
+			goto fail;
+		}
+		damon_add_region(r, t);
+		if (damon_nr_regions(t) > 1) {
+			prev = damon_prev_region(r);
+			if (prev->ar.end > r->ar.start) {
+				/*
+				 * Cannot happen: overlaps were already checked
+				 * when init_regions was set.
+				 */
+				goto fail;
+			}
+		}
+	}
+	return;
+
+fail:
+	damon_for_each_region_safe(r, next, t) {
+		damon_destroy_region(r, t);
+	}
+}
+
 /*
  * Update regions for current memory mappings
  */
@@ -293,13 +328,74 @@ static void damon_va_update(struct damon_ctx *ctx)
 	struct damon_target *t;

 	damon_for_each_target(t, ctx) {
+		/*
+		 * If init_regions have been set, update the target
+		 * according to init_regions.
+		 */
+		if (t->nr_init_regions) {
+			spin_lock(&t->target_lock);
+			damon_va_apply_init_regions(t);
+			spin_unlock(&t->target_lock);
+
+			continue;
+		}
 		if (damon_va_three_regions(t, three_regions))
 			continue;
+		spin_lock(&t->target_lock);
 		damon_set_regions(t, three_regions, 3);
+		spin_unlock(&t->target_lock);
 	}
 }

-static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
+static bool damon_pmdp_mknone(pmd_t *pmd, struct mm_walk *walk, unsigned long addr)
+{
+	bool preserve_write;
+	pmd_t entry = *pmd;
+	int *flush_enable = walk->private;
+
+	if (is_huge_zero_pmd(entry) || pmd_protnone(entry))
+		return false;
+
+	if (pmd_present(entry)) {
+		preserve_write = pmd_write(entry);
+		entry = pmdp_invalidate(walk->vma, addr, pmd);
+		entry = pmd_modify(entry, PAGE_NONE);
+		if (preserve_write)
+			entry = pmd_mkwrite(entry, walk->vma);
+
+		set_pmd_at(walk->mm, addr, pmd, entry);
+		++*flush_enable;
+		return true;
+	}
+	return false;
+}
+
+static bool damon_ptep_mknone(pte_t *pte, struct mm_walk *walk, unsigned long addr)
+{
+	pte_t oldpte, ptent;
+	bool preserve_write;
+	int *flush_enable = walk->private;
+
+	oldpte = *pte;
+	if (pte_protnone(oldpte))
+		return false;
+
+	if (pte_present(oldpte)) {
+		preserve_write = pte_write(oldpte);
+		oldpte = ptep_modify_prot_start(walk->vma, addr, pte);
+		ptent = pte_modify(oldpte, PAGE_NONE);
+
+		if (preserve_write)
+			ptent = pte_mkwrite(ptent, walk->vma);
+
+		ptep_modify_prot_commit(walk->vma, addr, pte, oldpte, ptent);
+		++*flush_enable;
+		return true;
+	}
+	return false;
+}
+
+static int damon_va_pmd_entry(pmd_t *pmd, unsigned long addr,
 		unsigned long next, struct mm_walk *walk)
 {
 	pte_t *pte;
@@ -317,6 +413,9 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
 	if (pmd_trans_huge(pmde)) {
 		damon_pmdp_mkold(pmd, walk->vma, addr);
+		if (static_branch_unlikely(&numa_stat_enabled_key) &&
+			nr_online_nodes > 1)
+			damon_pmdp_mknone(pmd, walk, addr);
 		spin_unlock(ptl);
 		return 0;
 	}
 	spin_unlock(ptl);
@@ -328,10 +427,14 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
 		walk->action = ACTION_AGAIN;
 		return 0;
 	}
-	if (!pte_present(ptep_get(pte)))
-		goto out;
+	if (!pte_present(*pte)) {
+		pte_unmap_unlock(pte, ptl);
+		return 0;
+	}
 	damon_ptep_mkold(pte, walk->vma, addr);
-out:
+	if (static_branch_unlikely(&numa_stat_enabled_key) &&
+		nr_online_nodes > 1)
+		damon_ptep_mknone(pte, walk, addr);
 	pte_unmap_unlock(pte, ptl);
 	return 0;
 }
@@ -389,16 +492,17 @@ static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
 #define damon_mkold_hugetlb_entry NULL
 #endif /* CONFIG_HUGETLB_PAGE */

-static const struct mm_walk_ops damon_mkold_ops = {
-	.pmd_entry = damon_mkold_pmd_entry,
+static const struct mm_walk_ops damon_va_ops = {
+	.pmd_entry = damon_va_pmd_entry,
 	.hugetlb_entry = damon_mkold_hugetlb_entry,
 	.walk_lock = PGWALK_RDLOCK,
 };

-static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
+static void damon_va_check(struct damon_ctx *ctx, struct mm_struct *mm,
+		unsigned long addr)
 {
 	mmap_read_lock(mm);
-	walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL);
+	walk_page_range(mm, addr, addr + 1, &damon_va_ops, &ctx->need_flush);
 	mmap_read_unlock(mm);
 }

@@ -406,12 +510,12 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
 * Functions for the access checking of the regions
 */

-static void __damon_va_prepare_access_check(struct mm_struct *mm,
+static void __damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm,
 					struct damon_region *r)
 {
 	r->sampling_addr = damon_rand(r->ar.start, r->ar.end);

-	damon_va_mkold(mm, r->sampling_addr);
+	damon_va_check(ctx, mm, r->sampling_addr);
 }

 static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
@@ -421,11 +525,33 @@
 	struct damon_region *r;

 	damon_for_each_target(t, ctx) {
+		ctx->need_flush = 0;
 		mm = damon_get_mm(t);
 		if (!mm)
 			continue;
+
+		if (static_branch_unlikely(&numa_stat_enabled_key) &&
+			nr_online_nodes > 1) {
+			inc_tlb_flush_pending(mm);
+			ctx->need_flush = 1;
+		}
+
 		damon_for_each_region(r, t)
-			__damon_va_prepare_access_check(mm, r);
+			__damon_va_prepare_access_check(ctx, mm, r);
+
+		/*
+		 * One core may still be doing NUMA sampling while another
+		 * core concurrently turns 'numa_stat_enabled_key' off.  If
+		 * we checked the static key here to decide whether to flush,
+		 * a required flush_tlb_mm() could be skipped, so use the
+		 * per-context need_flush count instead.
+		 */
+		if (ctx->need_flush > 1)
+			flush_tlb_mm(mm);
+
+		if (ctx->need_flush)
+			dec_tlb_flush_pending(mm);
+
 		mmput(mm);
 	}
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 92556dfe845166765b1d5d0efed85d6cca3ed060..5728934935a8f4edb658e0ffb1bdf35e743a5baf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -2170,6 +2171,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	if (node_is_toptier(nid))
 		last_cpupid = folio_last_cpupid(folio);
 	target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
+	damon_numa_fault(nid, numa_node_id(), vmf);
 	if (target_nid == NUMA_NO_NODE) {
 		folio_put(folio);
 		goto out_map;
diff --git a/mm/memory.c b/mm/memory.c
index 1870f34a79816b7f03fc7c7e26e846b4dfe95d90..e2b0625b9b96b807123fe77fe5454c79d9d5300b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -79,6 +79,7 @@
 #include
 #include
 #include
+#include

 #include

@@ -5956,6 +5957,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	else
 		last_cpupid = folio_last_cpupid(folio);
 	target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
+	damon_numa_fault(nid, numa_node_id(), vmf);
 	if (target_nid == NUMA_NO_NODE) {
 		folio_put(folio);
 		goto out_map;
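
For reference, a minimal sketch of how the new per-region counters could be consumed once this patch is applied. damon_region_remote_percent() is a hypothetical helper, not part of the patch; it only assumes the 'local' and 'remote' fields added to struct damon_region above.

#include <linux/damon.h>

/*
 * Hypothetical helper (illustration only): share of NUMA-remote accesses
 * recorded for a region.  Both counters are incremented from the NUMA
 * hinting fault path while the 'numa_stat' debugfs toggle is "on".
 */
static unsigned int damon_region_remote_percent(struct damon_region *r)
{
	unsigned long total = r->local + r->remote;

	return total ? (unsigned int)(r->remote * 100 / total) : 0;
}

A consumer of the damon_aggregated tracepoint, which now also reports the local and remote counts, could compute the same ratio in userspace to spot regions that are mostly accessed from remote nodes.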