diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_GROUP_BALANCER b/anolis/configs/L1-RECOMMEND/default/CONFIG_GROUP_BALANCER new file mode 100644 index 0000000000000000000000000000000000000000..2f9cf4ab7271ca09197ea2182a263019ff59a07d --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_GROUP_BALANCER @@ -0,0 +1 @@ +CONFIG_GROUP_BALANCER=y diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b5b974066a4546ee1d4b38683536b65b2dc2abd..217a58817641fec6215086a68f3387fdebe2ef30 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1634,6 +1634,10 @@ struct task_struct { unsigned long wait_moment; bool proxy_exec; +#ifdef CONFIG_GROUP_BALANCER + struct cpumask cpus_allowed_alt; + int soft_cpus_version; +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -2677,4 +2681,5 @@ static inline bool jbd2_proxy_exec_disabled(void) { return !static_branch_unlikely(&__jbd2_proxy_exec_enabled); } +extern void sched_task_release(struct task_struct *p); #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 1c45773304fce6b769d4e28b87cd681d63bf309b..0d226e6af46d5e4d02846f96eb08d7e6816e4553 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -35,4 +35,10 @@ extern int sched_acpu_enable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_GROUP_BALANCER +extern unsigned int sysctl_sched_group_balancer_enabled; +extern int sched_group_balancer_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 6325d1d0e90f5dcdc7bdc91d612f8fc4c7b40135..2b04e8cf89e57f552c016eb5affabdf38331a9b5 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -102,4 +102,5 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ +#define GROUP_BALANCER_MAGIC 0x26262626 #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/Kconfig b/init/Kconfig index 1c39589758f8d9b9eee4094b207167dbd05561ad..83d8cee6e92a1d7133803bacfce1fa255ba0c615 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1023,6 +1023,15 @@ config CFS_BANDWIDTH restriction. See Documentation/scheduler/sched-bwc.rst for more information. +config GROUP_BALANCER + bool "Group balancing for SCHED_OTHER" + depends on FAIR_GROUP_SCHED && SMP && CFS_BANDWIDTH + default n + help + This feature schedules task groups as a whole to achieve better + locality. It uses a soft CPU binding method which offers a dynamic + way to restrict the allowed CPUs for tasks in the same cgroup.
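The hunks below route nearly every scheduler lookup of p->cpus_ptr through task_allowed_cpu(p), backed by the cpus_allowed_alt and soft_cpus_version fields added to task_struct above. The helper's definition is not part of the hunks shown here; the following is only a minimal sketch of the intended shape, assuming it sits with the other inline helpers in kernel/sched/sched.h, and the exact caching and fallback rules are assumptions rather than the patch's actual code.

static inline const struct cpumask *task_allowed_cpu(struct task_struct *p)
{
#ifdef CONFIG_GROUP_BALANCER
	/*
	 * Assumed behaviour (not shown in this patch): cpus_allowed_alt
	 * caches the task's hard affinity (p->cpus_ptr) intersected with
	 * its group's soft_cpus_allowed mask; a negative soft_cpus_version
	 * marks that cache as stale (the affinity or cgroup changed), in
	 * which case we fall back to the hard affinity mask.
	 */
	if (group_balancer_enabled() && p->soft_cpus_version >= 0)
		return &p->cpus_allowed_alt;
#endif
	return p->cpus_ptr;
}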
+ config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" depends on CGROUP_SCHED diff --git a/kernel/exit.c b/kernel/exit.c index 96ac2a9452d9cfe273edf6767afedf0685a6a834..074d795478237f51d97aba73fbfc56c3ad4873f1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -279,6 +279,7 @@ void release_task(struct task_struct *p) } write_unlock_irq(&tasklist_lock); + sched_task_release(p); seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd4520ebc1a607734520ac342585a120..33762cb183425e6609803fa1ffb5572fdcbdb24b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -32,3 +32,5 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o + +obj-$(CONFIG_GROUP_BALANCER) += group_balancer.o \ No newline at end of file diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5627029cb9bfa61effb6546fe7256c0f2dca14e2..ddc84dfa84c154d836c531fdb81d198f12ae2a4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -165,6 +165,58 @@ unsigned int sysctl_sched_acpu_enabled; unsigned int sysctl_sched_cfs_bw_burst_onset_percent; #endif +#ifdef CONFIG_GROUP_BALANCER +DEFINE_STATIC_KEY_FALSE(__group_balancer_enabled); +unsigned int sysctl_sched_group_balancer_enabled; +DEFINE_RWLOCK(group_balancer_lock); + +static void group_balancer_enable(void) +{ + sched_init_group_balancer_sched_domains(); + static_branch_enable(&__group_balancer_enabled); +} + +static void group_balancer_disable(void) +{ + static_branch_disable(&__group_balancer_enabled); + sched_clear_group_balancer_sched_domains(); +} + +bool group_balancer_enabled(void) +{ + return static_branch_unlikely(&__group_balancer_enabled); +} + +int sched_group_balancer_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + unsigned int old, new; + + if (!write) { + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + return ret; + } + + old = sysctl_sched_group_balancer_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + new = sysctl_sched_group_balancer_enabled; + if (!ret && (old != new)) { + if (new) + /* + * Even if failed to build group balancer sched domains, + * group balancer should be enabled, so that we can use + * the cpu.soft_cpus interface. + */ + group_balancer_enable(); + else + group_balancer_disable(); + } + + return ret; +} +#endif + #ifdef CONFIG_SCHED_CORE DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); @@ -2537,7 +2589,7 @@ static int migration_cpu_stop(void *data) * ->pi_lock, so the allowed mask is stable - if it got * somewhere allowed, we're done. */ - if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + if (cpumask_test_cpu(task_cpu(p), task_allowed_cpu(p))) { p->migration_pending = NULL; complete = true; goto out; @@ -2674,6 +2726,10 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_next_task(rq, p); +#ifdef CONFIG_GROUP_BALANCER + /* Once p->cpus_ptr changed, keep soft_cpus_version negative before we sync soft cpus. 
*/ + p->soft_cpus_version = -1; +#endif } /* @@ -3363,10 +3419,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) goto out; - if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) + if (!cpumask_test_cpu(arg.dst_cpu, task_allowed_cpu(arg.src_task))) goto out; - if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) + if (!cpumask_test_cpu(arg.src_cpu, task_allowed_cpu(arg.dst_task))) goto out; trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); @@ -3446,7 +3502,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ - for_each_cpu(dest_cpu, p->cpus_ptr) { + for_each_cpu(dest_cpu, task_allowed_cpu(p)) { if (!is_cpu_allowed(p, dest_cpu)) continue; @@ -3505,7 +3561,7 @@ int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); *wake_flags |= WF_RQ_SELECTED; } else { - cpu = cpumask_any(p->cpus_ptr); + cpu = cpumask_any(task_allowed_cpu(p)); } /* @@ -3859,7 +3915,7 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) return false; /* Ensure the task will still be allowed to run on the CPU. */ - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(cpu, task_allowed_cpu(p))) return false; /* @@ -4701,6 +4757,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); +#endif +#ifdef CONFIG_GROUP_BALANCER + p->soft_cpus_version = -1; #endif return 0; } @@ -7851,7 +7910,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (curr_cpu == target_cpu) return 0; - if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(target_cpu, task_allowed_cpu(p))) return -EINVAL; /* TODO: This is not properly updating schedstats */ @@ -8385,6 +8444,11 @@ void __init sched_init_smp(void) sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); +#ifdef CONFIG_GROUP_BALANCER + cpumask_copy(&root_task_group.soft_cpus_allowed, cpu_online_mask); + root_task_group.soft_cpus_allowed_ptr = &root_task_group.soft_cpus_allowed; +#endif + /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) BUG(); @@ -8438,6 +8502,10 @@ LIST_HEAD(task_groups); static struct kmem_cache *task_group_cache __read_mostly; #endif +#ifdef CONFIG_GROUP_BALANCER +DECLARE_PER_CPU(cpumask_var_t, group_balancer_mask); +#endif + void __init sched_init(void) { unsigned long ptr = 0; @@ -8488,7 +8556,12 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } - +#ifdef CONFIG_GROUP_BALANCER + root_task_group.specs_ratio = -1; + root_task_group.group_balancer = 0; + root_task_group.soft_cpus_version = 0; + root_task_group.gb_sd = NULL; +#endif init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); #ifdef CONFIG_SMP @@ -8602,8 +8675,15 @@ void __init sched_init(void) rq->core_sibidle_start_task = 0; rq->core_cookie = 0UL; +#endif +#ifdef CONFIG_GROUP_BALANCER + rq->gb_sd = NULL; #endif zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); +#ifdef CONFIG_GROUP_BALANCER + zalloc_cpumask_var_node( + &per_cpu(group_balancer_mask, i), GFP_KERNEL, cpu_to_node(i)); +#endif } set_load_weight(&init_task, false); @@ -8965,6 +9045,24 @@ struct task_group *sched_create_group(struct task_group *parent) #if defined(CONFIG_SCHED_CORE) && 
defined(CONFIG_CFS_BANDWIDTH) tg->ht_ratio = 100; +#endif +#ifdef CONFIG_GROUP_BALANCER + cpumask_copy(&tg->soft_cpus_allowed, &parent->soft_cpus_allowed); + if (group_balancer_enabled()) { + read_lock(&group_balancer_lock); + if (parent->soft_cpus_allowed_ptr != &parent->soft_cpus_allowed || + parent->group_balancer) + tg->soft_cpus_allowed_ptr = parent->soft_cpus_allowed_ptr; + else + tg->soft_cpus_allowed_ptr = &tg->soft_cpus_allowed; + read_unlock(&group_balancer_lock); + } else { + tg->soft_cpus_allowed_ptr = &tg->soft_cpus_allowed; + } + tg->group_balancer = 0; + tg->soft_cpus_version = 0; + tg->gb_sd = NULL; + raw_spin_lock_init(&tg->gb_lock); #endif return tg; @@ -9059,6 +9157,10 @@ static void sched_change_group(struct task_struct *tsk) else #endif set_task_rq(tsk, task_cpu(tsk)); +#ifdef CONFIG_GROUP_BALANCER + /* Once tsk changed task group, keep soft_cpus_version negative before we sync soft cpus. */ + tsk->soft_cpus_version = -1; +#endif } /* @@ -9795,6 +9897,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) } } cfs_b->hierarchical_quota = quota; + tg_set_specs_ratio(tg); return 0; } @@ -9962,6 +10065,144 @@ static u64 cpu_ht_ratio_read(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_GROUP_BALANCER +static int cpu_soft_cpus_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + seq_printf(sf, "%*pbl\n", cpumask_pr_args(tg->soft_cpus_allowed_ptr)); + + return 0; +} + +static ssize_t cpu_soft_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct task_group *tg = css_tg(of_css(of)); + cpumask_t tmp_soft_cpus_allowed; + cpumask_t *tg_soft_cpus_allowed; + int retval; + + if (tg == &root_task_group) + return -EACCES; + + /* + * If any ancestor of tg(or itself) has already enabled group_balancer, + * it's not allowed to edit its soft_cpus_allowed. + */ + if (tg->soft_cpus_allowed_ptr != &tg->soft_cpus_allowed || tg->group_balancer) + return -EACCES; + + if (!*buf) { + cpumask_clear(&tmp_soft_cpus_allowed); + } else { + retval = cpulist_parse(buf, &tmp_soft_cpus_allowed); + if (retval < 0) + return retval; + } + + if (!cpumask_subset(&tmp_soft_cpus_allowed, cpu_online_mask)) + return -EINVAL; + + if (cpumask_empty(&tmp_soft_cpus_allowed)) + return -ENOSPC; + + tg_soft_cpus_allowed = &tg->soft_cpus_allowed; + if (!cpumask_equal(tg_soft_cpus_allowed, &tmp_soft_cpus_allowed)) { + cpumask_copy(tg_soft_cpus_allowed, &tmp_soft_cpus_allowed); + tg_inc_soft_cpus_version(tg); + } + + return nbytes; +} + +static u64 cpu_group_balancer_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->group_balancer; +} + +static int tg_validate_group_balancer_down(struct task_group *tg, void *data) +{ + if (tg->group_balancer) + return -EINVAL; + return 0; +} + +/* + * There is only one task group allowed to enable group balancer in the path from + * root_task_group to a certion leaf task group. 
+ */ +static int validate_group_balancer(struct task_group *tg) +{ + int retval = 0; + + rcu_read_lock(); + retval = walk_tg_tree_from(tg, tg_validate_group_balancer_down, + tg_nop, NULL); + if (retval) + goto out; + + for (; tg != &root_task_group; tg = tg->parent) { + if (tg->group_balancer) { + retval = -EINVAL; + break; + } + } +out: + rcu_read_unlock(); + return retval; +} + +static int cpu_group_balancer_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 new) +{ + struct task_group *tg = css_tg(css); + bool old; + int retval = 0; + + if (!group_balancer_enabled()) + return -EPERM; + + if (tg == &root_task_group || task_group_is_autogroup(tg)) + return -EACCES; + + if (new > 1) + return -EINVAL; + + write_lock(&group_balancer_lock); + old = tg->group_balancer; + + if (old == new) + goto out; + + if (new) { + retval = validate_group_balancer(tg); + if (retval) + goto out; + retval = attach_tg_to_group_balancer_sched_domain(tg, NULL, true); + if (retval) + goto out; + } else { + detach_tg_from_group_balancer_sched_domain(tg, true); + } + tg->group_balancer = new; +out: + write_unlock(&group_balancer_lock); + return retval; +} + +static s64 cpu_specs_ratio_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->specs_ratio; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT { @@ -10043,6 +10284,26 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_ht_ratio_read, .write_u64 = cpu_ht_ratio_write, }, +#endif +#ifdef CONFIG_GROUP_BALANCER + { + .name = "soft_cpus", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_soft_cpus_show, + .write = cpu_soft_cpus_write, + .max_write_len = (100U + 6 * 1024), + }, + { + .name = "group_balancer", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_group_balancer_read_u64, + .write_u64 = cpu_group_balancer_write_u64, + }, + { + .name = "specs_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_specs_ratio_read_s64, + }, #endif { } /* Terminate */ }; @@ -10607,6 +10868,26 @@ static struct cftype cpu_files[] = { .write_u64 = sched_lat_stat_write, .seq_show = sched_lat_stat_show }, +#endif +#ifdef CONFIG_GROUP_BALANCER + { + .name = "soft_cpus", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_soft_cpus_show, + .write = cpu_soft_cpus_write, + .max_write_len = (100U + 6 * 1024), + }, + { + .name = "group_balancer", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_group_balancer_read_u64, + .write_u64 = cpu_group_balancer_write_u64, + }, + { + .name = "specs_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_specs_ratio_read_s64, + }, #endif { } /* terminate */ }; @@ -10695,6 +10976,11 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) trace_sched_update_nr_running_tp(rq, count); } +/* A hook point for hotfix to release reserve memory used for scheduler. 
*/ +void sched_task_release(struct task_struct *p) +{ +} + #ifdef CONFIG_SCHED_MM_CID /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5161000eec0a6d42de753ce034b26ee0529c7f86..18c3024496fd78d9a84a9e2f4f33627d2aa5eabd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2228,7 +2228,7 @@ static void update_numa_stats(struct task_numa_env *env, if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { if (READ_ONCE(rq->numa_migrate_on) || - !cpumask_test_cpu(cpu, env->p->cpus_ptr)) + !cpumask_test_cpu(cpu, task_allowed_cpu(env->p))) continue; if (ns->idle_cpu == -1) @@ -2260,7 +2260,7 @@ static void task_numa_assign(struct task_numa_env *env, /* Find alternative idle CPU. */ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { if (cpu == env->best_cpu || !idle_cpu(cpu) || - !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { + !cpumask_test_cpu(cpu, task_allowed_cpu(env->p))) { continue; } @@ -2373,7 +2373,7 @@ static bool task_numa_compare(struct task_numa_env *env, } /* Skip this swap candidate if cannot move to the source cpu. */ - if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) + if (!cpumask_test_cpu(env->src_cpu, task_allowed_cpu(cur))) goto unlock; /* @@ -2572,7 +2572,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) + if (!cpumask_test_cpu(cpu, task_allowed_cpu(env->p))) continue; env->dst_cpu = cpu; @@ -6872,6 +6872,13 @@ static inline bool cpu_overutilized(int cpu) return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } +#ifdef CONFIG_GROUP_BALANCER +bool gb_cpu_overutilized(int cpu) +{ + return cpu_overutilized(cpu); +} +#endif + static inline void set_rd_overutilized_status(struct root_domain *rd, unsigned int status) { @@ -7235,6 +7242,16 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq) static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); +#ifdef CONFIG_GROUP_BALANCER + /* + * group_balancer_mask is used to mark which cpus have been balanced + * to this cpu during this load balance. If the src cpu hasn't been + * marked, we will balance the group balancer sched domains of src + * cpu and this cpu, and then mark the cpus of the src group balancer + * sched domain as balanced. 
+ */ +DEFINE_PER_CPU(cpumask_var_t, group_balancer_mask); +#endif #ifdef CONFIG_NO_HZ_COMMON @@ -7487,7 +7504,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + for_each_cpu_and(i, sched_group_span(group), task_allowed_cpu(p)) { struct rq *rq = cpu_rq(i); if (!sched_core_cookie_match(rq, p)) @@ -7534,7 +7551,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu; - if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) + if (!cpumask_intersects(sched_domain_span(sd), task_allowed_cpu(p))) return prev_cpu; /* @@ -7684,7 +7701,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t { int cpu; - for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { + for_each_cpu_and(cpu, cpu_smt_mask(target), task_allowed_cpu(p)) { if (cpu == target) continue; /* @@ -7738,7 +7755,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct sched_domain *this_sd = NULL; u64 time = 0; - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p)); if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; @@ -7854,7 +7871,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) struct cpumask *cpus; cpus = this_cpu_cpumask_var_ptr(select_rq_mask); - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p)); task_util = task_util_est(p); util_min = uclamp_eff_value(p, UCLAMP_MIN); @@ -7975,7 +7992,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && + cpumask_test_cpu(recent_used_cpu, task_allowed_cpu(p)) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { if (!static_branch_unlikely(&sched_cluster_active) || @@ -8423,7 +8440,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(cpu, task_allowed_cpu(p))) continue; util = cpu_util(cpu, p, cpu, 0); @@ -8572,7 +8589,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) record_wakee(p); if ((wake_flags & WF_CURRENT_CPU) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, task_allowed_cpu(p))) return cpu; if (sched_energy_enabled()) { @@ -8582,7 +8599,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) new_cpu = prev_cpu; } - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, task_allowed_cpu(p)); } rcu_read_lock(); @@ -9091,8 +9108,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) static unsigned long __read_mostly max_load_balance_interval = HZ/10; -enum fbq_type { regular, remote, all }; - /* * 'group_type' describes the group of CPUs at the moment of load balancing. 
* @@ -9136,46 +9151,12 @@ enum group_type { group_overloaded }; -enum migration_type { - migrate_load = 0, - migrate_util, - migrate_task, - migrate_misfit -}; - #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 #define LBF_ACTIVE_LB 0x10 -struct lb_env { - struct sched_domain *sd; - - struct rq *src_rq; - int src_cpu; - - int dst_cpu; - struct rq *dst_rq; - - struct cpumask *dst_grpmask; - int new_dst_cpu; - enum cpu_idle_type idle; - long imbalance; - /* The set of CPUs under consideration for load-balancing */ - struct cpumask *cpus; - - unsigned int flags; - - unsigned int loop; - unsigned int loop_break; - unsigned int loop_max; - - enum fbq_type fbq_type; - enum migration_type migration_type; - struct list_head tasks; -}; - /* * Is this task likely cache-hot: */ @@ -9344,7 +9325,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; - if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { + if (!cpumask_test_cpu(env->dst_cpu, task_allowed_cpu(p))) { int cpu; schedstat_inc(p->stats.nr_failed_migrations_affine); @@ -9367,7 +9348,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { + if (cpumask_test_cpu(cpu, task_allowed_cpu(p))) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -9795,6 +9776,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } } +#ifdef CONFIG_GROUP_BALANCER +unsigned long cfs_h_load(struct cfs_rq *cfs_rq) +{ + update_cfs_rq_h_load(cfs_rq); + return cfs_rq->h_load; +} +#endif + static unsigned long task_h_load(struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); @@ -10745,7 +10734,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_span(group), - p->cpus_ptr)) + task_allowed_cpu(p))) continue; /* Skip over this group if no cookie matched */ @@ -10867,7 +10856,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) if (p->nr_cpus_allowed != NR_CPUS) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); - cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); + cpumask_and(cpus, sched_group_span(local), task_allowed_cpu(p)); imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); } @@ -11728,6 +11717,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, /* Clear this flag as soon as we find a pullable task */ env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { + gb_load_balance(&env); /* * Attempt to move tasks. 
If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is @@ -12113,7 +12103,12 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize, need_decay = 0; u64 max_cost = 0; +#ifdef CONFIG_GROUP_BALANCER + struct cpumask *gb_mask = this_cpu_cpumask_var_ptr(group_balancer_mask); + if (group_balancer_enabled()) + cpumask_clear(gb_mask); +#endif rcu_read_lock(); for_each_domain(cpu, sd) { /* @@ -12697,7 +12692,12 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; +#ifdef CONFIG_GROUP_BALANCER + struct cpumask *gb_mask = this_cpu_cpumask_var_ptr(group_balancer_mask); + if (group_balancer_enabled()) + cpumask_clear(gb_mask); +#endif update_misfit_status(NULL, this_rq); /* @@ -13355,6 +13355,26 @@ void free_fair_sched_group(struct task_group *tg) kfree(tg->se); } +#ifdef CONFIG_GROUP_BALANCER +void tg_set_specs_ratio(struct task_group *tg) +{ + u64 quota = tg_cfs_bandwidth(tg)->hierarchical_quota; + u64 specs_ratio; + + if (quota == RUNTIME_INF) { + tg->specs_ratio = -1; + return; + } + + specs_ratio = quota / ((1 << BW_SHIFT) / 100); + + /* If specs_ratio is bigger than INT_MAX, set specs_ratio -1. */ + tg->specs_ratio = specs_ratio > INT_MAX ? -1 : specs_ratio; + if (tg->group_balancer) + tg_specs_change(tg); +} +#endif + int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; @@ -13371,6 +13391,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); + tg_set_specs_ratio(tg); for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c new file mode 100644 index 0000000000000000000000000000000000000000..d1471ae5f7cb9da737ff1234586a4b1dc6441359 --- /dev/null +++ b/kernel/sched/group_balancer.c @@ -0,0 +1,1890 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Group Balancer + * + * Group Balancer sched domains define and build + * Copyright (C) 2024 Alibaba Group, Inc., Cruz Zhao + */ +#include "sched.h" +#include "autogroup.h" +#include +#include +#include + +struct gb_lb_env { + struct group_balancer_sched_domain *src; + struct group_balancer_sched_domain *dst; + struct group_balancer_sched_domain *gb_sd; + long imbalance; + unsigned long nr_balance_failed; + enum migration_type migration_type; + struct rb_root task_groups; + + CK_KABI_RESERVE(1) + CK_KABI_RESERVE(2) + CK_KABI_RESERVE(3) + CK_KABI_RESERVE(4) +}; + +DECLARE_PER_CPU(cpumask_var_t, group_balancer_mask); + +struct group_balancer_sched_domain { + struct group_balancer_sched_domain *parent; + struct list_head child; + struct list_head sibling; + struct list_head topology_level_sibling; + struct list_head size_level_sibling; + unsigned long gb_flags; + char *topology_name; + unsigned int span_weight; + unsigned int nr_children; + /* If free_tg_specs is less than zero, the gb_sd is overloaded. 
*/ + int free_tg_specs; + unsigned int depth; + raw_spinlock_t lock; + struct rb_root task_groups; + struct kernfs_node *kn; + unsigned long lower_interval; + CK_KABI_RESERVE(1) + CK_KABI_RESERVE(2) + CK_KABI_RESERVE(3) + CK_KABI_RESERVE(4) + CK_KABI_RESERVE(5) + CK_KABI_RESERVE(6) + CK_KABI_RESERVE(7) + CK_KABI_RESERVE(8) + CK_KABI_RESERVE(9) + CK_KABI_RESERVE(10) + CK_KABI_RESERVE(11) + CK_KABI_RESERVE(12) + CK_KABI_RESERVE(13) + CK_KABI_RESERVE(14) + CK_KABI_RESERVE(15) + CK_KABI_RESERVE(16) + unsigned long span[]; +}; + +/* The topology that group balancer cares about. */ +enum GROUP_BALANCER_TOPOLOGY { + GROUP_BALANCER_ROOT, + GROUP_BALANCER_SOCKET, +#ifdef CONFIG_NUMA + GROUP_BALANCER_NUMA, +#endif + GROUP_BALANCER_DIE, + GROUP_BALANCER_LLC, +#ifdef CONFIG_SCHED_MC + GROUP_BALANCER_MC, +#endif +#ifdef CONFIG_SCHED_CLUSTER + GROUP_BALANCER_CLUSTER, +#endif +#ifdef CONFIG_SCHED_SMT + GROUP_BALANCER_SMT, +#endif + NR_GROUP_BALANCER_TOPOLOGY, +}; + +enum GROUP_BALANCER_TOPOLOGY_FLAGS { + GROUP_BALANCER_ROOT_FLAG = BIT(GROUP_BALANCER_ROOT), + GROUP_BALANCER_SOCKET_FLAG = BIT(GROUP_BALANCER_SOCKET), +#ifdef CONFIG_NUMA + GROUP_BALANCER_NUMA_FLAG = BIT(GROUP_BALANCER_NUMA), +#endif + GROUP_BALANCER_DIE_FLAG = BIT(GROUP_BALANCER_DIE), + GROUP_BALANCER_LLC_FLAG = BIT(GROUP_BALANCER_LLC), +#ifdef CONFIG_SCHED_MC + GROUP_BALANCER_MC_FLAG = BIT(GROUP_BALANCER_MC), +#endif +#ifdef CONFIG_SCHED_CLUSTER + GROUP_BALANCER_CLUSTER_FLAG = BIT(GROUP_BALANCER_CLUSTER), +#endif +#ifdef CONFIG_SCHED_SMT + GROUP_BALANCER_SMT_FLAG = BIT(GROUP_BALANCER_SMT), +#endif +}; + +struct group_balancer_topology_level { + sched_domain_mask_f mask; + sched_domain_flags_f sd_flags; + unsigned long gb_flags; + char *topology_name; + struct list_head domains; + bool skip; + CK_KABI_RESERVE(1) + CK_KABI_RESERVE(2) + CK_KABI_RESERVE(3) + CK_KABI_RESERVE(4) +}; + +struct group_balancer_size_level { + int size; + /* Use list temporarily, we will change to use rb_tree later.*/ + struct list_head domains; + CK_KABI_RESERVE(1) + CK_KABI_RESERVE(2) + CK_KABI_RESERVE(3) + CK_KABI_RESERVE(4) +}; + +LIST_HEAD(group_balancer_sched_domains); + +DEFINE_RWLOCK(group_balancer_sched_domain_lock); + +struct cpumask root_cpumask; + +static struct kernfs_root *group_balancer_fs_root; +static struct kernfs_node *group_balancer_fs_root_kn; +struct group_balancer_fs_context { + struct kernfs_fs_context kfc; + void *tmp; + CK_KABI_RESERVE(1) + CK_KABI_RESERVE(2) + CK_KABI_RESERVE(3) + CK_KABI_RESERVE(4) +}; + +struct gftype { + char *name; + umode_t mode; + const struct kernfs_ops *kf_ops; + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + CK_KABI_RESERVE(1) + CK_KABI_RESERVE(2) + CK_KABI_RESERVE(3) + CK_KABI_RESERVE(4) +}; + +const struct cpumask *cpu_llc_mask(int cpu) +{ + struct sched_domain *llc = rcu_dereference(per_cpu(sd_llc, cpu)); + + if (!llc) + return cpumask_of(cpu); + + return (const struct cpumask *)to_cpumask(llc->span); +} + +const struct cpumask *cpu_die_mask(int cpu) +{ + return topology_die_cpumask(cpu); +} + +const struct cpumask *cpu_core_mask(int cpu) +{ + return topology_core_cpumask(cpu); +} + +const struct cpumask *cpu_root_mask(int cpu) +{ + return (const struct cpumask *)&root_cpumask; +} + +#define GB_SD_INIT(type) \ + .gb_flags = GROUP_BALANCER_##type##_FLAG, \ + .topology_name = #type +/* + * Group Balancer build group_balancer_sched_domains after kernel init, + * so the following cpumask can be 
got safely. + * + * smt mask: cpu_smt_mask + * cluster mask: cpu_clustergroup_mask + * mc mask: cpu_coregroup_mask + * llc mask: cpu_llc_mask + * die mask: cpu_die_mask + * numa mask: cpu_cpu_mask + * socket mask: cpu_core_mask + * all mask: cpu_root_mask + */ +static struct group_balancer_topology_level default_topology[] = { + { cpu_root_mask, GB_SD_INIT(ROOT) }, + { cpu_core_mask, GB_SD_INIT(SOCKET) }, +#ifdef CONFIG_NUMA + { cpu_cpu_mask, GB_SD_INIT(NUMA) }, +#endif + { cpu_die_mask, GB_SD_INIT(DIE) }, + { cpu_llc_mask, GB_SD_INIT(LLC) }, +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, GB_SD_INIT(MC) }, +#endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, cpu_cluster_flags, GB_SD_INIT(CLUSTER) }, +#endif +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, GB_SD_INIT(SMT) }, +#endif + { NULL, }, +}; + +#define for_each_gb_topology_level(tl) \ + for (tl = default_topology; tl->mask; tl++) + +#define for_each_topology_level_sibling(pos, gb_tl) \ + list_for_each_entry(pos, &gb_tl->domains, topology_level_sibling) + +#define for_each_topology_level_sibling_safe(pos, n, gb_tl) \ + list_for_each_entry_safe(pos, n, &gb_tl->domains, topology_level_sibling) + +/* NR_CPUS is 1024 now, we set log(1024) + 1 = 11 levels. */ +#define NR_SIZE_LEVELS 11 +struct group_balancer_size_level default_size[NR_SIZE_LEVELS]; + +#define for_each_gb_size_level(sl, i) \ + for (sl = default_size, i = 0; i < NR_SIZE_LEVELS; sl++, i++) + +#define for_each_gb_sd_child(pos, gb_sd) \ + list_for_each_entry(pos, &gb_sd->child, sibling) + +#define for_each_gb_sd_child_safe(pos, n, gb_sd) \ + list_for_each_entry_safe(pos, n, &gb_sd->child, sibling) + +#define group_balancer_sched_domain_first_child(gb_sd) \ + list_first_entry(&gb_sd->child, struct group_balancer_sched_domain, sibling) + +#define __gb_node_2_tg(node) \ + rb_entry((node), struct task_group, gb_node) + +struct group_balancer_sched_domain *group_balancer_root_domain; + +#define MAX_NAME_LEN 128 +#define GB_OVERLOAD 0x1 +#define GB_OVERUTILIZED 0x2 + +static inline struct cpumask *gb_sd_span(struct group_balancer_sched_domain *gb_sd) +{ + return to_cpumask(gb_sd->span); +} + +static unsigned int get_size_level(struct group_balancer_sched_domain *gb_sd) +{ + int size_level = ilog2(gb_sd->span_weight); + + /* Prevent out-of-bound array access. 
*/ + if (unlikely(size_level < 0)) + size_level = 0; + else if (unlikely(size_level >= NR_SIZE_LEVELS)) + size_level = NR_SIZE_LEVELS - 1; + + return (unsigned int)size_level; +} + +static void __add_to_size_level(struct group_balancer_sched_domain *gb_sd, + unsigned int size_level) +{ + struct group_balancer_size_level *gb_sl; + + gb_sl = &default_size[size_level]; + list_add_tail(&gb_sd->size_level_sibling, &gb_sl->domains); +} + +static void add_to_size_level(struct group_balancer_sched_domain *gb_sd) +{ + unsigned int size_level = get_size_level(gb_sd); + + __add_to_size_level(gb_sd, size_level); +} + +static int group_balancer_seqfile_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + struct gftype *gft = of->kn->priv; + + if (gft->seq_show) + return gft->seq_show(of, m, arg); + return 0; +} + +static ssize_t group_balancer_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct gftype *gft = of->kn->priv; + + if (gft->write) + return gft->write(of, buf, nbytes, off); + + return -EINVAL; +} + +static const struct kernfs_ops group_balancer_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = group_balancer_file_write, + .seq_show = group_balancer_seqfile_show, +}; + +struct group_balancer_sched_domain *kernfs_to_gb_sd(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) + return kn->priv; + else + return kn->parent->priv; +} + +struct group_balancer_sched_domain *group_balancer_kn_lock_live(struct kernfs_node *kn) +{ + struct group_balancer_sched_domain *gb_sd = kernfs_to_gb_sd(kn); + + if (!gb_sd) + return NULL; + + kernfs_break_active_protection(kn); + cpus_read_lock(); + write_lock(&group_balancer_sched_domain_lock); + + return gb_sd; +} + +void group_balancer_kn_unlock(struct kernfs_node *kn) +{ + struct group_balancer_sched_domain *gb_sd = kernfs_to_gb_sd(kn); + + if (!gb_sd) + return; + + write_unlock(&group_balancer_sched_domain_lock); + cpus_read_unlock(); + kernfs_unbreak_active_protection(kn); +} + +static ssize_t group_balancer_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t new, tmp; + int cpu; + struct rq *rq; + struct group_balancer_sched_domain *gb_sd, *parent, *sibling, *child; + int old_size_level, new_size_level; + int ret = 0; + + if (!buf) + return -EINVAL; + if (!zalloc_cpumask_var(&new, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&tmp, GFP_KERNEL)) { + ret = -ENOMEM; + goto free_new; + } + + gb_sd = group_balancer_kn_lock_live(of->kn); + if (!gb_sd) { + ret = -ENOENT; + goto unlock; + } + + ret = cpulist_parse(buf, new); + if (ret) { + ret = -EINVAL; + goto unlock; + } + + if (cpumask_equal(new, gb_sd_span(gb_sd))) + goto unlock; + + parent = gb_sd->parent; + if (parent) { + /* New mask must be subset of parent.*/ + if (!cpumask_subset(new, gb_sd_span(parent))) { + ret = -EINVAL; + goto unlock; + } + + /* New mask must not inersect with siblings. */ + for_each_gb_sd_child(sibling, parent) { + if (gb_sd == sibling) + continue; + if (cpumask_intersects(new, gb_sd_span(sibling))) { + ret = -EINVAL; + goto unlock; + } + } + } + + /* New mask must include all the cpus of the children. */ + for_each_gb_sd_child(child, gb_sd) { + if (!cpumask_subset(gb_sd_span(child), new)) { + ret = -EINVAL; + goto unlock; + } + } + + /* + * rq->gb_sd points to the lowest level group_balancer_sched_domain + * that includes the cpu. + * + * We define two types of cpumask here: 'less' and 'more'. 
+ * - 'less' is the cpus that new cpumask lacks. + * - 'more' is the cpus that new cpumask newly adds. + * + * As the cpus of a child must be subset of its parent, the cpus in + * 'less' and 'more' are not included by any child of gb_sd, and the + * lowest level group_balancer_sched_domain that includes 'less' is + * the parent of gb_sd, the lowest level group_balancer_sched_domain + * that includes 'more' is gb_sd. + * + * So we need to set the rq->gb_sd of the cpus in 'less' to parent. + * and set the rq->gb_sd of the cpus in 'more' to gb_sd. + */ + cpumask_andnot(tmp, gb_sd_span(gb_sd), new); + for_each_cpu(cpu, tmp) { + rq = cpu_rq(cpu); + rq->gb_sd = parent; + } + + cpumask_andnot(tmp, new, gb_sd_span(gb_sd)); + for_each_cpu(cpu, tmp) { + rq = cpu_rq(cpu); + rq->gb_sd = gb_sd; + } + + old_size_level = get_size_level(gb_sd); + cpumask_copy(gb_sd_span(gb_sd), new); + gb_sd->span_weight = cpumask_weight(gb_sd_span(gb_sd)); + gb_sd->lower_interval = ilog2(gb_sd->span_weight) * gb_sd->span_weight; + new_size_level = get_size_level(gb_sd); + if (old_size_level != new_size_level) { + list_del(&gb_sd->size_level_sibling); + __add_to_size_level(gb_sd, new_size_level); + } + if (gb_sd == group_balancer_root_domain) + cpumask_copy(&root_cpumask, new); + +unlock: + group_balancer_kn_unlock(of->kn); + free_cpumask_var(tmp); +free_new: + free_cpumask_var(new); + + return ret ?: nbytes; +} + +static int group_balancer_cpus_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct group_balancer_sched_domain *gb_sd; + int ret = 0; + + gb_sd = group_balancer_kn_lock_live(of->kn); + + if (!gb_sd) { + ret = -ENOENT; + goto unlock; + } + + seq_printf(s, "%*pbl\n", cpumask_pr_args(gb_sd_span(gb_sd))); +unlock: + group_balancer_kn_unlock(of->kn); + return ret; +} + +static struct gftype group_balancer_files[] = { + { + .name = "cpus", + .mode = 0644, + .kf_ops = &group_balancer_kf_single_ops, + .write = group_balancer_cpus_write, + .seq_show = group_balancer_cpus_show, + }, +}; + +static int group_balancer_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int group_balancer_add_file(struct kernfs_node *parent_kn, struct gftype *gft) +{ + struct kernfs_node *kn; + int ret; + + kn = __kernfs_create_file(parent_kn, gft->name, gft->mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + gft->kf_ops, gft, NULL, NULL); + + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = group_balancer_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return ret; +} + +static int group_balancer_add_files(struct kernfs_node *kn) +{ + struct gftype *gfts, *gft; + int ret, len; + + gfts = group_balancer_files; + len = ARRAY_SIZE(group_balancer_files); + + for (gft = gfts; gft < gfts + len; gft++) { + ret = group_balancer_add_file(kn, gft); + if (ret) + goto err; + } + + return 0; +err: + pr_err("Group Balancer: Failed to add sysfs file %s, err=%d\n", gft->name, ret); + while (--gft >= gfts) + kernfs_remove_by_name(kn, gft->name); + + return ret; +} + +static inline struct group_balancer_sched_domain +*alloc_init_group_balancer_sched_domain(struct kernfs_node *parent, const char *name, umode_t mode) +{ + struct group_balancer_sched_domain *new, *ret; + struct kernfs_node *kn; + int retval; + + if (!parent) { + ret = ERR_PTR(-ENOENT); + goto 
err_out; + } + + new = kzalloc(sizeof(struct group_balancer_sched_domain) + cpumask_size(), GFP_KERNEL); + if (!new) { + ret = ERR_PTR(-ENOMEM); + goto err_out; + } + + kn = kernfs_create_dir(parent, name, mode, new); + if (IS_ERR(kn)) { + ret = (struct group_balancer_sched_domain *)kn; + goto free_new; + } + new->kn = kn; + + retval = group_balancer_add_files(kn); + if (retval) { + ret = ERR_PTR(retval); + goto remove_kn; + } + + INIT_LIST_HEAD(&new->child); + INIT_LIST_HEAD(&new->sibling); + INIT_LIST_HEAD(&new->topology_level_sibling); + INIT_LIST_HEAD(&new->size_level_sibling); + + raw_spin_lock_init(&new->lock); + new->task_groups = RB_ROOT; + + return new; +remove_kn: + kernfs_remove(kn); +free_new: + kfree(new); +err_out: + pr_err("Group Balancer: Failed to allocate and init a new group balancer sched domain.\n"); + return ret; +} + +static void add_to_tree(struct group_balancer_sched_domain *gb_sd, + struct group_balancer_sched_domain *parent) +{ + int cpu; + struct rq *rq; + + if (parent) { + list_add_tail(&gb_sd->sibling, &parent->child); + gb_sd->parent = parent; + parent->nr_children++; + /* + * When we bi-divide the group balancer sched domain, the parent, middle layer, + * hasn't been added to the tree yet, so for this case, we just let the depth + * increase by 1. + */ + if (parent->depth) + gb_sd->depth = parent->depth + 1; + else + gb_sd->depth++; + } else { + gb_sd->depth = 0; + } + gb_sd->span_weight = cpumask_weight(gb_sd_span(gb_sd)); + gb_sd->lower_interval = ilog2(gb_sd->span_weight) * gb_sd->span_weight; + gb_sd->free_tg_specs = 100 * gb_sd->span_weight; + add_to_size_level(gb_sd); + + if (!gb_sd->nr_children) { + for_each_cpu(cpu, gb_sd_span(gb_sd)) { + rq = cpu_rq(cpu); + rq->gb_sd = gb_sd; + } + } +} + +#define __node_2_task_group(n) rb_entry((n), struct task_group, gb_node) + +static inline bool tg_specs_less(struct rb_node *a, const struct rb_node *b) +{ + struct task_group *tg_a = __node_2_task_group(a); + struct task_group *tg_b = __node_2_task_group(b); + int specs_a = tg_a->specs_ratio; + int specs_b = tg_b->specs_ratio; + + return specs_a < specs_b; +} + +static int tg_set_gb_tg_down(struct task_group *tg, void *data) +{ + struct task_group *gb_tg = (struct task_group *)data; + + tg->soft_cpus_allowed_ptr = gb_tg->soft_cpus_allowed_ptr; + tg->gb_tg = gb_tg; + tg_inc_soft_cpus_version(tg); + + return 0; +} + +static int tg_unset_gb_tg_down(struct task_group *tg, void *data) +{ + tg->soft_cpus_allowed_ptr = &tg->soft_cpus_allowed; + tg->gb_tg = NULL; + tg_inc_soft_cpus_version(tg); + + return 0; +} + +static void free_group_balancer_sched_domain(struct group_balancer_sched_domain *gb_sd) +{ + int cpu; + struct rq *rq; + struct task_group *tg; + struct group_balancer_sched_domain *parent = gb_sd->parent; + struct rb_node *node; + struct rb_root *root = &gb_sd->task_groups; + + if (parent) { + parent->nr_children--; + /* Move the task_groups to parent. */ + while (!RB_EMPTY_ROOT(root)) { + node = root->rb_node; + tg = __node_2_task_group(node); + rb_erase(node, root); + rb_add(node, &parent->task_groups, tg_specs_less); + walk_tg_tree_from(tg, tg_set_gb_tg_down, tg_nop, tg); + } + } + + list_del(&gb_sd->sibling); + list_del(&gb_sd->topology_level_sibling); + list_del(&gb_sd->size_level_sibling); + + if (!gb_sd->nr_children) { + for_each_cpu(cpu, gb_sd_span(gb_sd)) { + rq = cpu_rq(cpu); + rq->gb_sd = gb_sd->parent; + } + } + + if (gb_sd->kn) + kernfs_remove(gb_sd->kn); + + kfree(gb_sd); +} + +/* free group balancer sched domain tree from the leaf nodes. 
*/ +static void free_group_balancer_sched_domains(void) +{ + struct group_balancer_sched_domain *parent, *child, *ancestor, *n; + + parent = group_balancer_root_domain; +down: + for_each_gb_sd_child_safe(child, n, parent) { + parent = child; + goto down; +up: + continue; + } + + ancestor = parent->parent; + /* root domain should always be in memory. */ + if (parent != group_balancer_root_domain && !parent->nr_children) { + n = list_next_entry(parent, sibling); + free_group_balancer_sched_domain(parent); + } + + child = n; + parent = ancestor; + if (parent) + goto up; +} + +static int move_group_balancer_kernfs(struct group_balancer_sched_domain *gb_sd, + struct group_balancer_sched_domain *new_parent) +{ + char *new_name = NULL; + int id = new_parent->nr_children; + int ret = 0; + + if (!gb_sd->kn || !new_parent->kn) + return -ENOMEM; + + new_name = kmalloc(MAX_NAME_LEN, GFP_KERNEL); + if (!new_name) + return -ENOMEM; + /* + * We use domain+id as new name, and if the name is already occupied, we let id++, + * until we find an unoccupied name. + */ + for (;;) { + struct kernfs_node *dup; + + sprintf(new_name, "domain%d", id); + dup = kernfs_find_and_get(new_parent->kn, new_name); + if (!dup) + break; + kernfs_put(dup); + id++; + } + + ret = kernfs_rename(gb_sd->kn, new_parent->kn, new_name); + kfree(new_name); + + return ret; +} + +static int move_group_balancer_sched_domain(struct group_balancer_sched_domain *child, + struct group_balancer_sched_domain *new_parent, + bool *is_first_child) +{ + int ret = 0; + + ret = move_group_balancer_kernfs(child, new_parent); + if (ret) + return ret; + + if (*is_first_child) { + *is_first_child = false; + new_parent->topology_name = child->topology_name; + new_parent->gb_flags = child->gb_flags; + } + cpumask_or(gb_sd_span(new_parent), gb_sd_span(child), gb_sd_span(new_parent)); + list_del(&child->sibling); + child->parent->nr_children--; + list_add_tail(&child->sibling, &new_parent->child); + new_parent->nr_children++; + child->parent = new_parent; + + return ret; +} + +static int bi_divide_group_balancer_sched_domain(struct group_balancer_sched_domain *gb_sd) +{ + unsigned int weight = gb_sd->span_weight; + unsigned int half = (weight + 1) / 2; + unsigned int logn = ilog2(half); + /* + * Find the power of 2 closest to half, and use this number + * to split weight into two parts, left and right, and keep + * left always the smaller one. + */ + unsigned int left = (half - (1 << logn) < (1 << (logn + 1)) - half) ? + 1 << logn : weight - (1 << (logn + 1)); + bool is_first_child = true; + struct group_balancer_sched_domain *child, *n; + struct group_balancer_sched_domain *left_middle, *right_middle; + int ret = 0; + + /* + * If a domain has more than two children, we add a middle level. + * For example, if a domain spans 48 cpus and 24 children, we add + * a middle level first, which contains two children who span 16 + * and 32 cpus. And we will divide the new children in the next + * loop. + * + * As for the size of middle level dividing, we choose powers of + * two instead of half of span_weight, to make the division of + * lower levels simpler. 
+ */ + if (gb_sd->nr_children > 2) { + left_middle = alloc_init_group_balancer_sched_domain(gb_sd->kn, + "left", 0); + if (IS_ERR(left_middle)) { + ret = PTR_ERR(left_middle); + goto err; + } + + right_middle = alloc_init_group_balancer_sched_domain(gb_sd->kn, + "right", 0); + if (IS_ERR(right_middle)) { + ret = PTR_ERR(right_middle); + goto free_left_middle; + } + + for_each_gb_sd_child_safe(child, n, gb_sd) { + /* + * Consider the following case, a domain spans 6 + * cpus and 3 chidlren(each child spans 2 cpus), + * we just need to add right middle which spans 4 + * cpus. + */ + ret = move_group_balancer_sched_domain(child, left_middle, + &is_first_child); + if (ret) + goto free_right_middle; + + if (cpumask_weight(gb_sd_span(left_middle)) >= left) + break; + } + + /* + * As left is always the smaller one, it is possible that + * left has only one child, if so, we delete the child. + */ + if (left_middle->nr_children == 1) { + child = group_balancer_sched_domain_first_child(left_middle); + free_group_balancer_sched_domain(child); + } + + is_first_child = true; + for_each_gb_sd_child_safe(child, n, gb_sd) { + ret = move_group_balancer_sched_domain(child, right_middle, + &is_first_child); + if (ret) + goto free_right_middle; + } + + add_to_tree(left_middle, gb_sd); + add_to_tree(right_middle, gb_sd); + /* Uniform naming format. "left" and "right" are temporary name. */ + ret = kernfs_rename(left_middle->kn, gb_sd->kn, "domain0"); + if (ret) + goto err; + ret = kernfs_rename(right_middle->kn, gb_sd->kn, "domain1"); + if (ret) + goto err; + } + + return 0; +free_right_middle: + free_group_balancer_sched_domain(right_middle); +free_left_middle: + free_group_balancer_sched_domain(left_middle); +err: + free_group_balancer_sched_domains(); + return ret; +} + +/* DFS to bi-divide group balancer sched domains. */ +static int bi_divide_group_balancer_sched_domains(void) +{ + struct group_balancer_sched_domain *parent, *child; + int ret = 0; + + /* + * Traverse all the domains from the group_balancer_sched_domains list, + * and add the new domains to the tail of the list, to ensure that all + * the domains will be traversed. + */ + parent = group_balancer_root_domain; +down: + ret = bi_divide_group_balancer_sched_domain(parent); + if (ret) + goto out; + for_each_gb_sd_child(child, parent) { + parent = child; + goto down; +up: + continue; + } + if (parent == group_balancer_root_domain) + goto out; + + child = parent; + parent = parent->parent; + if (parent) + goto up; +out: + return ret; +} + +/* + * After we build the tree, the depth may be not correct as we moved + * the subtree during the build process, so we correct the depth by + * recalculating. 
+ */ +static void set_group_balancer_sched_domain_depth(void) +{ + struct group_balancer_sched_domain *parent, *child; + + parent = group_balancer_root_domain; + parent->depth = 0; +down: + for_each_gb_sd_child(child, parent) { + child->depth = parent->depth + 1; + parent = child; + goto down; +up: + continue; + } + if (parent == group_balancer_root_domain) + goto out; + + child = parent; + parent = parent->parent; + if (parent) + goto up; +out: + return; +} + +static int build_group_balancer_root_domain(void) +{ + struct group_balancer_sched_domain *root; + + root = alloc_init_group_balancer_sched_domain(group_balancer_fs_root_kn, "root_domain", 0); + if (IS_ERR(root)) { + pr_err("Group Balancer: Failed to alloc group_balancer root domain.\n"); + return PTR_ERR(root); + } + cpumask_copy(gb_sd_span(root), &root_cpumask); + list_add_tail(&root->topology_level_sibling, &default_topology[0].domains); + add_to_tree(root, NULL); + group_balancer_root_domain = root; + + return 0; +} + +/* BFS to build group balancer sched domain tree. */ +static int build_group_balancer_sched_domains(void) +{ + int cpu; + int ret; + cpumask_var_t trial_cpumask, child_cpumask; + struct group_balancer_topology_level *gb_tl, *next_gb_tl; + struct group_balancer_sched_domain *parent, *n; + char *name = NULL; + + /* + * The group balancer sched domain is a tree. + * If the root was not built on boot, build the root node first. + */ + if (unlikely(!group_balancer_root_domain)) { + ret = build_group_balancer_root_domain(); + if (ret) + goto err_out; + } + + if (!zalloc_cpumask_var(&trial_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_out; + } + if (!zalloc_cpumask_var(&child_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_trial_cpumask; + } + + name = kmalloc(MAX_NAME_LEN, GFP_KERNEL); + if (!name) { + ret = -ENOMEM; + goto err_free_domains; + } + + /* Build the tree by level. */ + for_each_gb_topology_level(gb_tl) { + if (gb_tl->skip) + continue; + next_gb_tl = gb_tl + 1; + while (next_gb_tl->skip && next_gb_tl->mask) + next_gb_tl++; + if (!next_gb_tl->mask) + break; + /* Build children from parent level. */ + rcu_read_lock(); + for_each_topology_level_sibling_safe(parent, n, gb_tl) { + /* + * If the cpumasks of the adjacent topology levels are the same, + * we move the domain to the next level, to make the loop + * continue. + */ + cpu = cpumask_first(gb_sd_span(parent)); + cpumask_and(child_cpumask, &root_cpumask, next_gb_tl->mask(cpu)); + if (cpumask_equal(gb_sd_span(parent), child_cpumask)) { + list_del(&parent->topology_level_sibling); + list_add_tail(&parent->topology_level_sibling, + &next_gb_tl->domains); + parent->gb_flags &= next_gb_tl->gb_flags; + continue; + } + cpumask_copy(trial_cpumask, gb_sd_span(parent)); + for_each_cpu(cpu, trial_cpumask) { + struct group_balancer_sched_domain *child; + + cpumask_and(child_cpumask, &root_cpumask, next_gb_tl->mask(cpu)); + cpumask_andnot(trial_cpumask, trial_cpumask, child_cpumask); + /* + * parent->nr_children is a variable that only increases and never + * decreases at this stage. So if we use domain+nr_children as name, + * there will be no duplicate names. 
+ */ + sprintf(name, "domain%d", parent->nr_children); + child = alloc_init_group_balancer_sched_domain(parent->kn, name, 0); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + rcu_read_unlock(); + goto err_free_name; + } + cpumask_copy(gb_sd_span(child), child_cpumask); + child->topology_name = next_gb_tl->topology_name; + list_add_tail(&child->topology_level_sibling, &next_gb_tl->domains); + child->gb_flags &= next_gb_tl->gb_flags; + add_to_tree(child, parent); + } + } + rcu_read_unlock(); + } + + kfree(name); + free_cpumask_var(child_cpumask); + free_cpumask_var(trial_cpumask); + return bi_divide_group_balancer_sched_domains(); + +err_free_name: + kfree(name); +err_free_domains: + free_group_balancer_sched_domains(); + free_cpumask_var(child_cpumask); +err_free_trial_cpumask: + free_cpumask_var(trial_cpumask); +err_out: + return ret; +} + +static inline struct group_balancer_fs_context *group_balancer_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct group_balancer_fs_context, kfc); +} + + +static int group_balancer_get_tree(struct fs_context *fc) +{ + + return kernfs_get_tree(fc); +} + +static void group_balancer_fs_context_free(struct fs_context *fc) +{ + struct group_balancer_fs_context *ctx = group_balancer_fc2context(fc); + + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static const struct fs_context_operations group_balancer_context_ops = { + .free = group_balancer_fs_context_free, + .get_tree = group_balancer_get_tree, +}; + +static int group_balancer_init_fs_context(struct fs_context *fc) +{ + struct group_balancer_fs_context *ctx; + + ctx = kzalloc(sizeof(struct group_balancer_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->kfc.root = group_balancer_fs_root; + ctx->kfc.magic = GROUP_BALANCER_MAGIC; + fc->fs_private = &ctx->kfc; + fc->ops = &group_balancer_context_ops; + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(&init_user_ns); + fc->global = true; + return 0; +} + +static void group_balancer_kill_sb(struct super_block *sb) +{ + kernfs_kill_sb(sb); +} + +static struct file_system_type group_balancer_fs_type = { + .name = "group_balancer", + .init_fs_context = group_balancer_init_fs_context, + .kill_sb = group_balancer_kill_sb, +}; + +static int group_balancer_mkdir(struct kernfs_node *kn, const char *name, umode_t mode) +{ + struct group_balancer_sched_domain *new; + struct group_balancer_sched_domain *parent = kernfs_to_gb_sd(kn); + + if (kn == group_balancer_fs_root_kn) + return -EPERM; + + group_balancer_kn_lock_live(kn); + new = alloc_init_group_balancer_sched_domain(kn, name, mode); + add_to_tree(new, parent); + group_balancer_kn_unlock(kn); + if (IS_ERR(new)) + return PTR_ERR(new); + + return 0; +} + +static int group_balancer_rmdir(struct kernfs_node *kn) +{ + struct group_balancer_sched_domain *gb_sd; + int ret = 0; + + gb_sd = kn->priv; + + if (gb_sd == group_balancer_root_domain) { + ret = -EPERM; + goto unlock; + } + if (gb_sd->nr_children) { + ret = -EBUSY; + goto unlock; + } + + group_balancer_kn_lock_live(kn); + free_group_balancer_sched_domain(gb_sd); + +unlock: + group_balancer_kn_unlock(kn); + return ret; +} + +static struct kernfs_syscall_ops group_balancer_kf_syscall_ops = { + .mkdir = group_balancer_mkdir, + .rmdir = group_balancer_rmdir, +}; + +void sched_init_group_balancer_levels(void) +{ + struct group_balancer_topology_level *tl; + struct group_balancer_size_level *sl; + int i; + + for_each_gb_topology_level(tl) + INIT_LIST_HEAD(&tl->domains); + + 
for_each_gb_size_level(sl, i) { + sl->size = 1 << i; + INIT_LIST_HEAD(&sl->domains); + } +} + +/* + * There are cases where some topologies are not reported correctly, + * e.g., on some virtual machines, the DIE cpumask is incorrect and only + * includes one cpu. + * To avoid building incorrect group balancer sched domains due to this + * kind of incorrect topology, we check whether the topology is correct, + * and if not, we mark it to be skipped. + */ +static void validate_topology_levels(void) +{ + struct group_balancer_topology_level *gb_tl, *next_gb_tl; + int i; + + for (i = 1; i < NR_GROUP_BALANCER_TOPOLOGY - 1; i++) { + gb_tl = &default_topology[i]; + next_gb_tl = &default_topology[i + 1]; + if (!next_gb_tl->mask) + break; + rcu_read_lock(); + if (!cpumask_subset(next_gb_tl->mask(0), gb_tl->mask(0)) || + (cpumask_weight(gb_tl->mask(0)) <= 1)) + gb_tl->skip = true; + rcu_read_unlock(); + } +} + +void sched_init_group_balancer_sched_domains(void) +{ + int ret; + + cpus_read_lock(); + write_lock(&group_balancer_sched_domain_lock); + ret = build_group_balancer_sched_domains(); + if (ret) + pr_err("Group Balancer: Failed to build group balancer sched domains: %d\n", ret); + else + pr_info("Group Balancer: Built group balancer sched domains successfully.\n"); + set_group_balancer_sched_domain_depth(); + write_unlock(&group_balancer_sched_domain_lock); + cpus_read_unlock(); +} + +void sched_clear_group_balancer_sched_domains(void) +{ + cpus_read_lock(); + write_lock(&group_balancer_sched_domain_lock); + free_group_balancer_sched_domains(); + pr_info("Group Balancer: Freed group balancer sched domains.\n"); + write_unlock(&group_balancer_sched_domain_lock); + cpus_read_unlock(); +} + +static int __init sched_init_group_balancer_kernfs(void) +{ + int ret = 0; + + group_balancer_fs_root = kernfs_create_root(&group_balancer_kf_syscall_ops, 0, NULL); + if (IS_ERR(group_balancer_fs_root)) + return PTR_ERR(group_balancer_fs_root); + + group_balancer_fs_root_kn = kernfs_root_to_node(group_balancer_fs_root); + + ret = sysfs_create_mount_point(fs_kobj, "group_balancer"); + if (ret) + goto cleanup_root; + + pr_info("Group Balancer: Created group balancer mount point.\n"); + ret = register_filesystem(&group_balancer_fs_type); + if (ret) + goto cleanup_mountpoint; + + pr_info("Group Balancer: Registered group balancer file system.\n"); + + return 0; + +cleanup_mountpoint: + sysfs_remove_mount_point(fs_kobj, "group_balancer"); +cleanup_root: + kernfs_destroy_root(group_balancer_fs_root); + pr_err("Group Balancer: Failed to register group balancer file system.\n"); + return ret; +} + +void update_group_balancer_root_cpumask(void) +{ + cpumask_copy(&root_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); +} + +static int __init group_balancer_init(void) +{ + int ret; + + update_group_balancer_root_cpumask(); + sched_init_group_balancer_levels(); + validate_topology_levels(); + ret = sched_init_group_balancer_kernfs(); + if (ret) + return ret; + return build_group_balancer_root_domain(); +} + +late_initcall(group_balancer_init); + +static void __exit sched_exit_group_balancer_kernfs(void) +{ + unregister_filesystem(&group_balancer_fs_type); + sysfs_remove_mount_point(fs_kobj, "group_balancer"); + kernfs_destroy_root(group_balancer_fs_root); + group_balancer_fs_root_kn = NULL; +} + +__exitcall(sched_exit_group_balancer_kernfs); + +static unsigned long tg_gb_sd_load(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + int cpu; + unsigned long load = 0; + + for_each_cpu(cpu, gb_sd_span(gb_sd)) + load += 
+static unsigned long tg_gb_sd_load(struct task_group *tg, struct group_balancer_sched_domain *gb_sd)
+{
+	int cpu;
+	unsigned long load = 0;
+
+	for_each_cpu(cpu, gb_sd_span(gb_sd))
+		load += cfs_h_load(tg->cfs_rq[cpu]);
+
+	return load;
+}
+
+static unsigned long tg_gb_sd_util(struct task_group *tg, struct group_balancer_sched_domain *gb_sd)
+{
+	int cpu;
+	unsigned long util = 0;
+
+	for_each_cpu(cpu, gb_sd_span(gb_sd))
+		util += READ_ONCE(tg->cfs_rq[cpu]->avg.util_est.enqueued);
+
+	return util;
+}
+
+static unsigned long gb_sd_load(struct group_balancer_sched_domain *gb_sd)
+{
+	int cpu;
+	unsigned long load = 0;
+
+	for_each_cpu(cpu, gb_sd_span(gb_sd))
+		load += cpu_rq(cpu)->cfs.avg.load_avg;
+
+	return load;
+}
+
+static unsigned long gb_sd_capacity(struct group_balancer_sched_domain *gb_sd)
+{
+	int cpu;
+	unsigned long cap = 0;
+
+	for_each_cpu(cpu, gb_sd_span(gb_sd))
+		cap += cpu_rq(cpu)->cpu_capacity;
+
+	return cap;
+}
+
+static struct group_balancer_sched_domain *select_idle_gb_sd(int specs)
+{
+	struct group_balancer_sched_domain *gb_sd, *child;
+
+	if (specs == -1 || specs > group_balancer_root_domain->span_weight * 100)
+		return group_balancer_root_domain;
+
+	gb_sd = group_balancer_root_domain;
+
+	while (gb_sd) {
+		struct group_balancer_sched_domain *max_free_child = NULL;
+		int max_free_specs = INT_MIN;
+		struct group_balancer_sched_domain *max_unsatisfied_free_child = NULL;
+		int max_unsatisfied_free_specs = INT_MIN;
+
+		for_each_gb_sd_child(child, gb_sd) {
+			if (child->span_weight * 100 >= specs &&
+			    child->free_tg_specs > max_free_specs) {
+				max_free_child = child;
+				max_free_specs = child->free_tg_specs;
+			} else if (child->span_weight * 100 < specs &&
+				   child->free_tg_specs > max_unsatisfied_free_specs) {
+				max_unsatisfied_free_child = child;
+				max_unsatisfied_free_specs = child->free_tg_specs;
+			}
+		}
+		if (!max_free_child)
+			break;
+		/*
+		 * Consider the following case:
+		 * gb_sd->span_weight is 6 and gb_sd has two children with weights 2
+		 * and 4. A task group with specs 300 has already selected the child
+		 * with weight 4, and now another task group with specs 300 needs to
+		 * select a sched domain.
+		 * It would be unreasonable to place both task groups on the child
+		 * with weight 4, so in that case we stop and return gb_sd itself.
+		 * When comparing the free specs of two group balancer sched domains,
+		 * compare them relative to the span weight, because raw free specs
+		 * cannot fully represent the degree of idleness when the span
+		 * weights differ.
+		 */
+		if (max_free_specs < specs &&
+		    max_free_specs / max_free_child->span_weight <
+		    max_unsatisfied_free_specs / max_unsatisfied_free_child->span_weight)
+			break;
+		gb_sd = max_free_child;
+	}
+
+	return gb_sd;
+}
+
+static void
+check_task_group_leap_level(struct task_group *tg, struct group_balancer_sched_domain *gb_sd)
+{
+	struct group_balancer_sched_domain *child;
+	int specs = tg->specs_ratio;
+
+	for_each_gb_sd_child(child, gb_sd) {
+		if (specs <= 100 * child->span_weight) {
+			tg->leap_level = true;
+			tg->leap_level_timestamp = jiffies;
+			return;
+		}
+	}
+
+	tg->leap_level = false;
+}
+
+/*
+ * Lock ordering:
+ * When we attach/detach a task group to/from a domain, we take the read lock
+ * of group_balancer_sched_domain_lock first and then gb_sd->lock.
+ * When we free a domain, we have to move its task groups to its parent, so we
+ * take the write lock of group_balancer_sched_domain_lock.
+ * When we balance two domains, we take the read lock of
+ * group_balancer_sched_domain_lock first and then the locks of both domains.
+ * This way there is no race.
+ * TODO: optimize the locking when task groups are moved during balancing.
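+ *
+ * In short, the acquisition order is:
+ *   attach/detach: read_lock(&group_balancer_sched_domain_lock) -> gb_sd->lock
+ *   free domain:   write_lock(&group_balancer_sched_domain_lock)
+ *   balance:       read_lock(&group_balancer_sched_domain_lock) -> both domains' locks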
+ */
+void add_tg_to_group_balancer_sched_domain_locked(struct task_group *tg,
+						  struct group_balancer_sched_domain *gb_sd,
+						  bool enable)
+{
+	int specs = tg->specs_ratio;
+	struct group_balancer_sched_domain *parent;
+
+	tg->gb_sd = gb_sd;
+	rb_add(&tg->gb_node, &gb_sd->task_groups, tg_specs_less);
+
+	if (specs != -1) {
+		/* The caller already holds gb_sd->lock. */
+		gb_sd->free_tg_specs -= specs;
+		for (parent = gb_sd->parent; parent; parent = parent->parent) {
+			raw_spin_lock(&parent->lock);
+			parent->free_tg_specs -= specs;
+			raw_spin_unlock(&parent->lock);
+		}
+	}
+
+	tg->soft_cpus_allowed_ptr = gb_sd_span(gb_sd);
+	if (enable)
+		walk_tg_tree_from(tg, tg_set_gb_tg_down, tg_nop, tg);
+
+	check_task_group_leap_level(tg, gb_sd);
+}
+
+void add_tg_to_group_balancer_sched_domain(struct task_group *tg,
+					   struct group_balancer_sched_domain *gb_sd,
+					   bool enable)
+{
+	raw_spin_lock(&gb_sd->lock);
+	add_tg_to_group_balancer_sched_domain_locked(tg, gb_sd, enable);
+	raw_spin_unlock(&gb_sd->lock);
+}
+
+static void
+remove_tg_from_group_balancer_sched_domain_locked(struct task_group *tg,
+						   struct group_balancer_sched_domain *gb_sd,
+						   bool disable)
+{
+	int specs = tg->specs_ratio;
+
+	tg->gb_sd = NULL;
+	rb_erase(&tg->gb_node, &gb_sd->task_groups);
+	if (specs != -1) {
+		/* The caller already holds gb_sd->lock. */
+		gb_sd->free_tg_specs += specs;
+		for (gb_sd = gb_sd->parent; gb_sd; gb_sd = gb_sd->parent) {
+			raw_spin_lock(&gb_sd->lock);
+			gb_sd->free_tg_specs += specs;
+			raw_spin_unlock(&gb_sd->lock);
+		}
+	}
+
+	if (disable)
+		walk_tg_tree_from(tg, tg_unset_gb_tg_down, tg_nop, NULL);
+}
+
+static void
+remove_tg_from_group_balancer_sched_domain(struct task_group *tg,
+					   struct group_balancer_sched_domain *gb_sd,
+					   bool disable)
+{
+	read_lock(&group_balancer_sched_domain_lock);
+	raw_spin_lock(&gb_sd->lock);
+	remove_tg_from_group_balancer_sched_domain_locked(tg, gb_sd, disable);
+	raw_spin_unlock(&gb_sd->lock);
+	read_unlock(&group_balancer_sched_domain_lock);
+}
+
+int attach_tg_to_group_balancer_sched_domain(struct task_group *tg,
+					     struct group_balancer_sched_domain *target,
+					     bool enable)
+{
+	struct group_balancer_sched_domain *gb_sd;
+	int ret = 0;
+
+	read_lock(&group_balancer_sched_domain_lock);
+	if (enable)
+		gb_sd = select_idle_gb_sd(tg->specs_ratio);
+	else
+		gb_sd = target;
+	if (!gb_sd) {
+		ret = -ESRCH;
+		goto out;
+	}
+	add_tg_to_group_balancer_sched_domain(tg, gb_sd, enable);
+out:
+	read_unlock(&group_balancer_sched_domain_lock);
+	return ret;
+}
+
+void detach_tg_from_group_balancer_sched_domain(struct task_group *tg, bool disable)
+{
+	struct group_balancer_sched_domain *gb_sd = tg->gb_sd;
+
+	if (!gb_sd)
+		return;
+
+	remove_tg_from_group_balancer_sched_domain(tg, gb_sd, disable);
+}
+
+static void tg_upper_level(struct task_group *tg, struct group_balancer_sched_domain *gb_sd)
+{
+	detach_tg_from_group_balancer_sched_domain(tg, false);
+	/* gb_sd may be NULL here; in that case treat the attach as an enable. */
+	attach_tg_to_group_balancer_sched_domain(tg, gb_sd, !gb_sd);
+}
+
+static bool tg_lower_level(struct task_group *tg)
+{
+	struct group_balancer_sched_domain *gb_sd = tg->gb_sd;
+	struct group_balancer_sched_domain *child, *dst = NULL;
+	unsigned long tg_child_load, tg_load = 0, tg_dst_load = 0;
+	unsigned long child_load, src_load, dst_load = 0, total_load = 0, migrate_load;
+	unsigned long child_cap, total_cap = 0, src_cap, dst_cap = 0;
+	unsigned long src_imb, dst_imb;
+
+	if (!gb_sd)
+		goto fail;
+
+	/*
+	 * gb_sd may have several children and the tasks of tg may be spread
+	 * across them. Lowering the level of tg is essentially a gathering
+	 * operation, so pick the child that holds the largest share of tg's
+	 * load as the target.
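+	 * For example, if roughly 70% of tg's load sits on one child and 30% on
+	 * another, the former becomes the target and only the remaining 30% of
+	 * the load has to move.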
+	 */
+	for_each_gb_sd_child(child, gb_sd) {
+		child_load = gb_sd_load(child);
+		total_load += child_load;
+
+		child_cap = gb_sd_capacity(child);
+		total_cap += child_cap;
+
+		tg_child_load = tg_gb_sd_load(tg, child);
+		if (tg_child_load > tg_dst_load) {
+			dst = child;
+			tg_dst_load = tg_child_load;
+			dst_load = child_load;
+			dst_cap = child_cap;
+		}
+		tg_load += tg_child_load;
+	}
+
+	if (!dst)
+		goto fail;
+	if (tg->specs_ratio > 100 * dst->span_weight)
+		goto fail;
+#ifdef CONFIG_NUMA
+	/* We won't allow a task group to span more than two NUMA nodes for too long. */
+	if (dst->gb_flags & GROUP_BALANCER_NUMA_FLAG)
+		goto lower;
+#endif
+	/*
+	 * If we lower the level, we have to make sure that we will not cause an
+	 * imbalance:
+	 *
+	 *   src_load            dst_load
+	 * ------------    vs    ------------
+	 * src_capacity          dst_capacity
+	 *
+	 * The comparison is done by cross-multiplication to avoid divisions.
+	 */
+	migrate_load = tg_load - tg_dst_load;
+
+	src_cap = total_cap - dst_cap;
+	src_load = total_load - dst_load;
+	src_imb = abs(src_load * dst_cap - dst_load * src_cap);
+	dst_imb = abs((src_load - migrate_load) * dst_cap - (dst_load + migrate_load) * src_cap);
+
+	if (dst_imb > src_imb)
+		goto fail;
+#ifdef CONFIG_NUMA
+lower:
+#endif
+	detach_tg_from_group_balancer_sched_domain(tg, false);
+	attach_tg_to_group_balancer_sched_domain(tg, dst, false);
+	/* The task group may still leap level, check it again. */
+	check_task_group_leap_level(tg, gb_sd);
+
+	return true;
+fail:
+	tg->leap_level_timestamp = jiffies;
+	return false;
+}
+
+static void gb_task_group_tick(struct task_group *tg)
+{
+	struct group_balancer_sched_domain *gb_sd = tg->gb_sd;
+
+	if (!gb_sd)
+		return;
+
+	if (!tg->leap_level)
+		return;
+
+	if (!time_after(jiffies, tg->leap_level_timestamp + gb_sd->lower_interval))
+		return;
+
+	read_lock(&group_balancer_sched_domain_lock);
+	tg_lower_level(tg);
+	read_unlock(&group_balancer_sched_domain_lock);
+}
+
+static struct task_group *gb_task_group(struct task_struct *p)
+{
+	struct task_group *tg = task_group(p);
+
+	if (tg == &root_task_group || task_group_is_autogroup(tg))
+		return NULL;
+
+	return tg->gb_tg;
+}
+
+void gb_task_tick(struct task_struct *p)
+{
+	struct task_group *tg = gb_task_group(p);
+
+	if (!group_balancer_enabled())
+		return;
+
+	if (!tg || !tg->group_balancer)
+		return;
+
+	if (!raw_spin_trylock(&tg->gb_lock))
+		return;
+
+	gb_task_group_tick(tg);
+	raw_spin_unlock(&tg->gb_lock);
+}
+
+void tg_specs_change(struct task_group *tg)
+{
+	struct group_balancer_sched_domain *gb_sd;
+	int specs = tg->specs_ratio;
+
+	gb_sd = tg->gb_sd;
+	if (!gb_sd)
+		/* tg->group_balancer is always true here, so find a gb_sd to attach to. */
+		goto upper;
+
+	/* If the task group leaps level after the specs change, we will lower it later. */
+	check_task_group_leap_level(tg, gb_sd);
+	if (tg->leap_level)
+		return;
+
+	/* This gb_sd still satisfies the specs, nothing to do. */
+	if (specs <= gb_sd->span_weight * 100 || gb_sd == group_balancer_root_domain)
+		return;
+
+	/*
+	 * The specs no longer fit, so move up to find a gb_sd that satisfies them.
+	 * Fast path: if the specs is -1 or too large, move the group to the root domain.
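+	 * (A specs value of 100 appears to correspond to one full CPU, so
+	 * span_weight * 100 is the capacity of the whole domain on that scale.)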
*/ + if (specs == -1 || specs > group_balancer_root_domain->span_weight * 100) { + gb_sd = group_balancer_root_domain; + goto upper; + } + + for (; gb_sd; gb_sd = gb_sd->parent) { + if (specs <= gb_sd->span_weight * 100) + break; + } + + if (!gb_sd) + gb_sd = group_balancer_root_domain; +upper: + tg_upper_level(tg, gb_sd); + +} + +static struct group_balancer_sched_domain +*find_matching_gb_sd(struct group_balancer_sched_domain **src, + struct group_balancer_sched_domain **dst) +{ + int src_depth, dst_depth; + + if (!*src || !*dst || *src == *dst) + return NULL; + + src_depth = (*src)->depth; + dst_depth = (*dst)->depth; + + if (!src_depth || !dst_depth) + return NULL; + + while (src_depth > dst_depth) { + src_depth--; + *src = (*src)->parent; + } + + while (dst_depth > src_depth) { + dst_depth--; + *dst = (*dst)->parent; + } + + + while ((*src)->parent != (*dst)->parent) { + *src = (*src)->parent; + *dst = (*dst)->parent; + if (!*src || !*dst) + return NULL; + } + + return (*src)->parent; +} + +#define gb_for_each_tg_safe(pos, n, root) \ + for (pos = rb_entry_safe(rb_first(root), struct task_group, gb_node); \ + pos && ({ n = rb_entry_safe(rb_next(&pos->gb_node), \ + struct task_group, gb_node) ; 1; }); \ + pos = n) + +static int gb_detach_task_groups(struct gb_lb_env *gb_env) +{ + struct group_balancer_sched_domain *gb_sd, *child; + struct task_group *tg, *n; + unsigned long load, util; + int detached = 0; + + gb_sd = gb_env->gb_sd; + if (!gb_sd) + return 0; + + for_each_gb_sd_child(child, gb_sd) { + raw_spin_lock(&child->lock); + /* Try the task cgroups with little specs first. */ + gb_for_each_tg_safe(tg, n, &child->task_groups) { + switch (gb_env->migration_type) { +#ifdef CONFIG_GROUP_IDENTITY + case migrate_identity: + fallthrough; +#endif + case migrate_load: + load = max_t(unsigned long, tg_gb_sd_load(tg, gb_sd), 1); + if (shr_bound(load, gb_env->nr_balance_failed) > gb_env->imbalance) + continue; + gb_env->imbalance -= load; + break; + case migrate_util: + util = tg_gb_sd_util(tg, gb_sd); + if (shr_bound(util, gb_env->nr_balance_failed) > gb_env->imbalance) + continue; + gb_env->imbalance -= util; + break; + case migrate_task: + gb_env->imbalance = 0; + break; + /*TODO: Perfect strategy of migrate_misfit*/ + case migrate_misfit: + gb_env->imbalance = 0; + break; + } + remove_tg_from_group_balancer_sched_domain_locked(tg, child, false); + rb_add(&tg->gb_node, &gb_env->task_groups, tg_specs_less); + detached++; + if (gb_env->imbalance <= 0) { + raw_spin_unlock(&child->lock); + return detached; + } + } + raw_spin_unlock(&child->lock); + } + + return detached; +} + +static void gb_attach_task_groups(struct gb_lb_env *gb_env) +{ + struct task_group *tg; + struct group_balancer_sched_domain *gb_sd = gb_env->gb_sd; + struct rb_node *node; + struct rb_root *root = &gb_env->task_groups; + + raw_spin_lock(&gb_sd->lock); + while (!RB_EMPTY_ROOT(root)) { + node = root->rb_node; + tg = __gb_node_2_tg(node); + rb_erase(node, &gb_env->task_groups); + add_tg_to_group_balancer_sched_domain_locked(tg, gb_sd, false); + } + raw_spin_unlock(&gb_sd->lock); +} + +static void __update_gb_sd_status(struct group_balancer_sched_domain *gb_sd, int *gb_sd_status) +{ + int i, nr_running; + + for_each_cpu(i, gb_sd_span(gb_sd)) { + struct rq *rq = cpu_rq(i); + + nr_running = rq->nr_running; + if (nr_running > 1) + *gb_sd_status |= GB_OVERLOAD; + + if (gb_cpu_overutilized(i)) + *gb_sd_status |= GB_OVERUTILIZED; + } +} + +static void update_gb_sd_status(struct gb_lb_env *gb_env, int *gb_sd_status) +{ + if 
(!gb_env->src) + return; + + __update_gb_sd_status(gb_env->src, gb_sd_status); +} + +void gb_load_balance(struct lb_env *env) +{ + struct rq *src_rq = env->src_rq, *dst_rq = env->dst_rq; + struct gb_lb_env gb_env; + struct group_balancer_sched_domain *src, *dst, *gb_sd, *parent; + struct rb_node *node; + struct task_group *tg; + int gb_sd_status = 0; + struct cpumask *gb_mask = this_cpu_cpumask_var_ptr(group_balancer_mask); + + if (!group_balancer_enabled()) + return; + + /* + * src cpu has balanced some task groups to dst cpu during this load balance + * process, skip it. + */ + if (cpumask_test_cpu(env->src_cpu, gb_mask)) + return; + + if (!src_rq || !dst_rq) + return; + + read_lock(&group_balancer_sched_domain_lock); + + src = src_rq->gb_sd; + dst = dst_rq->gb_sd; + + gb_sd = find_matching_gb_sd(&src, &dst); + if (!gb_sd) + goto unlock; + + gb_env = (struct gb_lb_env){ + .src = src, + .dst = dst, + .gb_sd = gb_sd, + .migration_type = env->migration_type, + .imbalance = env->imbalance, + .nr_balance_failed = env->sd->nr_balance_failed, + .task_groups = RB_ROOT, + }; + + /* + * If there are some tasks belongs to gb_sd or any ancestor, they can be migrated, + * and we don't migrate tg in this case. + */ + for (parent = gb_sd; parent; parent = parent->parent) { + for (node = rb_first(&parent->task_groups); node; node = rb_next(node)) { + tg = __node_2_task_group(node); + if (tg->cfs_rq[env->src_cpu]->h_nr_runnable) + goto unlock; + } + } + + update_gb_sd_status(&gb_env, &gb_sd_status); + /* + * If the src domain is not overloaded, or there no imbalance between src and dst domain, + * do not migrate task groups. + */ + if (!gb_sd_status || !gb_env.imbalance) + goto out; + + if (gb_detach_task_groups(&gb_env)) + gb_attach_task_groups(&gb_env); + +out: + cpumask_or(gb_mask, gb_mask, gb_sd_span(gb_sd)); +unlock: + read_unlock(&group_balancer_sched_domain_lock); +} diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 33dcd35605abc8ee462a60db7aeba852df007212..50cf5dd9b770a6a8ad91e0e3689d640d004420ff 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -394,6 +394,7 @@ static ssize_t write_dyn_isolcpus(struct file *file, const char __user *buf, update_wilds_cpumask(new_allowed, old_allowed); rebuild_sched_domains(); + update_group_balancer_root_cpumask(); workqueue_set_unbound_cpumask(new_allowed); free_all: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 174a1fa8a6f69207a53d1a238b8a838e99176bdb..c779ef86a86e93a0abeae6dc5d6bd908da1e37e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -104,6 +104,53 @@ struct rq; struct cpuidle_state; +#ifdef CONFIG_GROUP_BALANCER +struct group_balancer_sched_domain; +#endif + +#ifdef CONFIG_SMP +enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, + migrate_misfit, +#ifdef CONFIG_GROUP_IDENTITY + migrate_identity +#endif +}; + +enum fbq_type { regular, remote, all }; + +struct lb_env { + struct sched_domain *sd; + + struct rq *src_rq; + int src_cpu; + + int dst_cpu; + struct rq *dst_rq; + + struct cpumask *dst_grpmask; + int new_dst_cpu; + enum cpu_idle_type idle; + long imbalance; + /* The set of CPUs under consideration for load-balancing */ + struct cpumask *cpus; + + unsigned int flags; + + unsigned int loop; + unsigned int loop_break; + unsigned int loop_max; + + enum fbq_type fbq_type; + enum migration_type migration_type; + struct list_head tasks; +#ifdef CONFIG_GROUP_IDENTITY + bool id_need_redo; +#endif +}; +#endif /* task_struct::on_rq states: */ #define 
TASK_ON_RQ_QUEUED 1 @@ -540,6 +587,20 @@ struct task_group { struct sched_cgroup_lat_stat_cpu __percpu *lat_stat_cpu; #endif +#ifdef CONFIG_GROUP_BALANCER + const cpumask_t *soft_cpus_allowed_ptr; + cpumask_t soft_cpus_allowed; + int soft_cpus_version; + int specs_ratio; + struct rb_node gb_node; + struct group_balancer_sched_domain *gb_sd; + struct task_group *gb_tg; + bool group_balancer; + bool leap_level; + unsigned long leap_level_timestamp; + raw_spinlock_t gb_lock; +#endif + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -1401,6 +1462,10 @@ struct rq { u64 last_acpu_update_time_task; #endif +#ifdef CONFIG_GROUP_BALANCER + struct group_balancer_sched_domain *gb_sd; +#endif + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -4166,4 +4231,55 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); #include "ext.h" +#ifdef CONFIG_GROUP_BALANCER +extern bool group_balancer_enabled(void); +static inline const struct cpumask *task_allowed_cpu(struct task_struct *p) +{ + if (group_balancer_enabled()) { + struct task_group *tg = task_group(p); + + if (unlikely(p->soft_cpus_version != tg->soft_cpus_version)) { + cpumask_and(&p->cpus_allowed_alt, p->cpus_ptr, + tg->soft_cpus_allowed_ptr); + p->soft_cpus_version = tg->soft_cpus_version; + } + if (!cpumask_empty(&p->cpus_allowed_alt)) + return &p->cpus_allowed_alt; + } + return p->cpus_ptr; +} + +static inline void tg_inc_soft_cpus_version(struct task_group *tg) +{ + tg->soft_cpus_version++; + if (unlikely(tg->soft_cpus_version < 0)) + tg->soft_cpus_version = 0; +} + +extern void sched_init_group_balancer_sched_domains(void); +extern void sched_clear_group_balancer_sched_domains(void); +extern void tg_set_specs_ratio(struct task_group *tg); +extern int attach_tg_to_group_balancer_sched_domain(struct task_group *tg, + struct group_balancer_sched_domain *target, + bool enable); +extern void detach_tg_from_group_balancer_sched_domain(struct task_group *tg, bool disable); +extern void update_group_balancer_root_cpumask(void); +extern void tg_specs_change(struct task_group *tg); +extern unsigned long cfs_h_load(struct cfs_rq *cfs_rq); +extern bool gb_cpu_overutilized(int cpu); +extern void gb_load_balance(struct lb_env *env); +extern void gb_task_tick(struct task_struct *p); +#else +static inline const struct cpumask *task_allowed_cpu(struct task_struct *p) +{ + return p->cpus_ptr; +} +static inline void tg_set_specs_ratio(struct task_group *tg) { } +static inline void update_group_balancer_root_cpumask(void) { } +static inline void tg_specs_change(struct task_group *tg) { } +#ifdef CONFIG_SMP +static inline void gb_load_balance(struct lb_env *env) { } +#endif +static inline void gb_task_tick(struct task_struct *p) { } +#endif #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5a6d327dd68357a1c0fab015b3f4391080919ffc..89ce81a0d7359b4bdb5095cceb81b69fe8fbde7a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2081,6 +2081,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SCHED_ACPU*/ +#ifdef CONFIG_GROUP_BALANCER + { + .procname = "sched_group_balancer", + .data = &sysctl_sched_group_balancer_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_group_balancer_enable_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_RICH_CONTAINER { .procname = "rich_container_enable",