diff --git a/tools/lib/perf/include/perf/bpf_perf.h b/tools/lib/perf/include/perf/bpf_perf.h
index e7cf6ba7b674bc94041cc28f879e7678ec2f2820..64c8d211726d7f6831dffedcdd8025536a201322 100644
--- a/tools/lib/perf/include/perf/bpf_perf.h
+++ b/tools/lib/perf/include/perf/bpf_perf.h
@@ -23,6 +23,7 @@
 struct perf_event_attr_map_entry {
 	__u32 link_id;
 	__u32 diff_map_id;
+	__u8 supported;
 };
 
 /* default attr_map name */
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 9692ebdd7f11e9aaa66525e6a6d3db172cab827e..712c170eb7781bba5d54259d5cad0d0535d7c6e4 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -2624,6 +2624,7 @@ int cmd_stat(int argc, const char **argv)
 	} else if (big_num_opt == 0) /* User passed --no-big-num */
 		stat_config.big_num = false;
 
+	target.inherit = !stat_config.no_inherit;
 	err = target__validate(&target);
 	if (err) {
 		target__strerror(&target, err, errbuf, BUFSIZ);
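The single added line above carries the whole user-visible policy: perf stat counts child tasks by default, and --no-inherit opts out, so target.inherit simply forwards that choice to the BPF-counter code. This mirrors what a plain perf event gets from perf_event_attr.inherit, where the counts of children forked after the event is opened are folded into the parent's count as they exit. A minimal standalone sketch of that baseline behavior (not part of the patch; the busy-loop workload and the trimmed error handling are illustrative only):

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;
	attr.inherit = 1;	/* children forked from now on are counted too */

	/* pid == 0, cpu == -1: count this process on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	if (fork() == 0) {
		/* child work; merged into the parent's count at exit */
		for (volatile long i = 0; i < 10000000; i++)
			;
		_exit(0);
	}
	wait(NULL);
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) != sizeof(count))
		return 1;
	printf("instructions (parent + exited child): %lld\n", count);
	return 0;
}

Before this series, a --bpf-counters session had no equivalent of that fold-in; the reworked test below is built around exactly this gap.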
diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh
index a87bb2814b4c6de8395219d268eaa15fb21b6e15..d5c135225cc184e27d49ef147968e61a6151ae73 100755
--- a/tools/perf/tests/shell/stat_bpf_counters.sh
+++ b/tools/perf/tests/shell/stat_bpf_counters.sh
@@ -4,42 +4,71 @@
 
 set -e
 
+workload="perf test -w sqrtloop"
+
-# check whether $2 is within +/- 10% of $1
+# check whether $2 is within +/- 20% of $1
 compare_number()
 {
-        first_num=$1
-        second_num=$2
-
-        # upper bound is first_num * 110%
-        upper=$(expr $first_num + $first_num / 10 )
-        # lower bound is first_num * 90%
-        lower=$(expr $first_num - $first_num / 10 )
-
-        if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then
-                echo "The difference between $first_num and $second_num are greater than 10%."
-                exit 1
-        fi
+	first_num=$1
+	second_num=$2
+
+	# upper bound is first_num * 120%
+	upper=$(expr $first_num + $first_num / 5 )
+	# lower bound is first_num * 80%
+	lower=$(expr $first_num - $first_num / 5 )
+
+	if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then
+		echo "The difference between $first_num and $second_num is greater than 20%."
+		exit 1
+	fi
+}
+
+check_counts()
+{
+	base_instructions=$1
+	bpf_instructions=$2
+
+	if [ "$base_instructions" = "<not" ]; then
+		echo "Skipping: instructions event not counted"
+		exit 2
+	fi
+	if [ "$bpf_instructions" = "<not" ]; then
+		echo "Failed: instructions not counted with --bpf-counters"
+		exit 1
+	fi
+}
+
+test_bpf_counters()
+{
+	printf "Testing --bpf-counters "
+	base_instructions=$(perf stat --no-big-num -e instructions -- $workload 2>&1 | awk '/instructions/ {print $1}')
+	bpf_instructions=$(perf stat --no-big-num --bpf-counters -e instructions -- $workload 2>&1 | awk '/instructions/ {print $1}')
+	check_counts $base_instructions $bpf_instructions
+	compare_number $base_instructions $bpf_instructions
+	echo "[Success]"
+}
+
+test_bpf_modifier()
+{
+	printf "Testing bpf event modifier "
+	stat_output=$(perf stat --no-big-num -e instructions/name=base_instructions/,instructions/name=bpf_instructions/b -- $workload 2>&1)
+	base_instructions=$(echo "$stat_output" | awk '/base_instructions/ {print $1}')
+	bpf_instructions=$(echo "$stat_output" | awk '/bpf_instructions/ {print $1}')
+	check_counts $base_instructions $bpf_instructions
+	compare_number $base_instructions $bpf_instructions
+	echo "[Success]"
 }
 
 # skip if --bpf-counters is not supported
-if ! perf stat -e cycles --bpf-counters true > /dev/null 2>&1; then
+if ! perf stat -e instructions --bpf-counters true > /dev/null 2>&1; then
 	if [ "$1" = "-v" ]; then
 		echo "Skipping: --bpf-counters not supported"
-		perf --no-pager stat -e cycles --bpf-counters true || true
+		perf --no-pager stat -e instructions --bpf-counters true || true
 	fi
 	exit 2
 fi
 
-base_cycles=$(perf stat --no-big-num -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}')
-if [ "$base_cycles" = "<not" ]; then
-	echo "Skipping: cycles event not counted"
-	exit 2
-fi
-
-bpf_cycles=$(perf stat --no-big-num --bpf-counters -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}')
-if [ "$bpf_cycles" = "<not" ]; then
-	echo "Failed: cycles not counted with --bpf-counters"
-	exit 1
-fi
-
-compare_number $base_cycles $bpf_cycles
+test_bpf_counters
+test_bpf_modifier
+
 exit 0
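To exercise this locally: the script is picked up by perf's shell-test runner, so it can be run directly or through perf test (shell tests are matched by their description string, so the exact filter argument may vary by tree). The workload switch matters here: "perf test -w sqrtloop" is one of perf's built-in test workloads and fork()s a child to do the square-root loop, so test_bpf_counters only passes when the child's instructions are inherited into the parent's count. The trailing "b" suffix exercised by test_bpf_modifier is the per-event modifier spelling of --bpf-counters.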
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -394,6 +394,7 @@ static int bperf_check_target(struct evsel *evsel,
 }
 
 static	struct perf_cpu_map *all_cpu_map;
+static __u32 filter_entry_cnt;
 
 static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
 				       struct perf_event_attr_map_entry *entry)
@@ -413,18 +414,19 @@ static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
 	diff_map_fd = bpf_map__fd(skel->maps.diff_readings);
 	entry->link_id = bpf_link_get_id(link_fd);
 	entry->diff_map_id = bpf_map_get_id(diff_map_fd);
-	err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY);
-	assert(err == 0);
-
-	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
-	assert(evsel->bperf_leader_link_fd >= 0);
-
 	/*
 	 * save leader_skel for install_pe, which is called within
 	 * following evsel__open_per_cpu call
 	 */
 	evsel->leader_skel = skel;
-	evsel__open_per_cpu(evsel, all_cpu_map, -1);
+	if (!evsel__open_per_cpu(evsel, all_cpu_map, -1))
+		entry->supported = true;
+
+	err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY);
+	assert(err == 0);
+
+	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
+	assert(evsel->bperf_leader_link_fd >= 0);
 
 out:
 	bperf_leader_bpf__destroy(skel);
@@ -441,12 +443,38 @@ static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
 	return err;
 }
 
+static int bperf_attach_follower_program(struct bperf_follower_bpf *skel,
+					 enum bperf_filter_type filter_type,
+					 bool inherit)
+{
+	struct bpf_link *link;
+	int err = 0;
+
+	if ((filter_type == BPERF_FILTER_PID ||
+	     filter_type == BPERF_FILTER_TGID) && inherit)
+		/* attach all follower bpf progs to enable event inheritance */
+		err = bperf_follower_bpf__attach(skel);
+	else {
+		link = bpf_program__attach(skel->progs.fexit_XXX);
+		if (IS_ERR(link))
+			err = PTR_ERR(link);
+	}
+
+	return err;
+}
+
 static int bperf__load(struct evsel *evsel, struct target *target)
 {
-	struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
+	struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff, false};
 	int attr_map_fd, diff_map_fd = -1, err;
 	enum bperf_filter_type filter_type;
-	__u32 filter_entry_cnt, i;
+	__u32 i;
+
+	if (evsel->evlist->core.nr_entries > ATTR_MAP_SIZE) {
+		pr_err("Too many events, please limit to %d or less\n",
+		       ATTR_MAP_SIZE);
+		return -1;
+	}
 
 	if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
 		return -1;
@@ -485,6 +513,7 @@ static int bperf__load(struct evsel *evsel, struct target *target)
 		err = -1;
 		goto out;
 	}
+	evsel->supported = entry.supported;
 	/*
 	 * The bpf_link holds reference to the leader program, and the
 	 * leader program holds reference to the maps. Therefore, if
@@ -526,9 +555,6 @@ static int bperf__load(struct evsel *evsel, struct target *target)
 	/* set up reading map */
 	bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
 				 filter_entry_cnt);
-	/* set up follower filter based on target */
-	bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
-				 filter_entry_cnt);
 	err = bperf_follower_bpf__load(evsel->follower_skel);
 	if (err) {
 		pr_err("Failed to load follower skeleton\n");
@@ -540,6 +566,7 @@ static int bperf__load(struct evsel *evsel, struct target *target)
 	for (i = 0; i < filter_entry_cnt; i++) {
 		int filter_map_fd;
 		__u32 key;
+		struct bperf_filter_value fval = { i, 0 };
 
 		if (filter_type == BPERF_FILTER_PID ||
 		    filter_type == BPERF_FILTER_TGID)
@@ -550,12 +577,14 @@ static int bperf__load(struct evsel *evsel, struct target *target)
 			break;
 
 		filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
-		bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
+		bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY);
 	}
 
 	evsel->follower_skel->bss->type = filter_type;
+	evsel->follower_skel->bss->inherit = target->inherit;
 
-	err = bperf_follower_bpf__attach(evsel->follower_skel);
+	err = bperf_attach_follower_program(evsel->follower_skel, filter_type,
+					    target->inherit);
 
 out:
 	if (err && evsel->bperf_leader_link_fd >= 0)
@@ -620,7 +649,7 @@ static int bperf__read(struct evsel *evsel)
 	bperf_sync_counters(evsel);
 	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
 
-	for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
+	for (i = 0; i < filter_entry_cnt; i++) {
 		struct perf_cpu entry;
 		__u32 cpu;
 
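Two design points in bperf_attach_follower_program() are worth spelling out: the task-lifecycle programs are only attached for PID/TGID filtering with inherit enabled, because tp_btf/task_newtask and tp_btf/sched_process_exit fire for every fork and exit on the system; and when they are not needed, only the counting fexit program is attached. A sketch of the same policy written against generic libbpf calls instead of the generated skeleton (the helper name is ours, the filter-type check is elided for brevity, and the links are intentionally left open for the lifetime of the session, which is how perf itself holds them):

#include <stdbool.h>
#include <string.h>
#include <bpf/libbpf.h>

static int attach_policy(struct bpf_object *obj, bool inherit)
{
	struct bpf_program *prog;

	bpf_object__for_each_program(prog, obj) {
		/* without inheritance, skip everything but the counter */
		if (!inherit && strcmp(bpf_program__name(prog), "fexit_XXX") != 0)
			continue;
		if (libbpf_get_error(bpf_program__attach(prog)))
			return -1;
	}
	return 0;
}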
diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
index f193998530d431d828eb0ebee6840e166eb6aae7..0595063139a3d5555f6e9cc2314769be1a476d18 100644
--- a/tools/perf/util/bpf_skel/bperf_follower.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
@@ -5,6 +5,8 @@
 #include <bpf/bpf_tracing.h>
 #include "bperf_u.h"
 
+#define MAX_ENTRIES 102400
+
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__uint(key_size, sizeof(__u32));
@@ -22,25 +24,29 @@
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
 	__uint(key_size, sizeof(__u32));
-	__uint(value_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bperf_filter_value));
+	__uint(max_entries, MAX_ENTRIES);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
 } filter SEC(".maps");
 
 enum bperf_filter_type type = 0;
 int enabled = 0;
+int inherit;
 
 SEC("fexit/XXX")
 int BPF_PROG(fexit_XXX)
 {
 	struct bpf_perf_event_value *diff_val, *accum_val;
 	__u32 filter_key, zero = 0;
-	__u32 *accum_key;
+	__u32 accum_key;
+	struct bperf_filter_value *fval;
 
 	if (!enabled)
 		return 0;
 
 	switch (type) {
 	case BPERF_FILTER_GLOBAL:
-		accum_key = &zero;
+		accum_key = zero;
 		goto do_add;
 	case BPERF_FILTER_CPU:
 		filter_key = bpf_get_smp_processor_id();
@@ -49,22 +55,34 @@ int BPF_PROG(fexit_XXX)
 		filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
 		break;
 	case BPERF_FILTER_TGID:
-		filter_key = bpf_get_current_pid_tgid() >> 32;
+		/* Use pid as the filter_key to exclude new task counts
+		 * when inherit is disabled. Don't worry about the existing
+		 * children in TGID losing their counts, bpf_counter has
+		 * already added them to the filter map via perf_thread_map
+		 * before this bpf prog runs.
+		 */
+		filter_key = inherit ?
+			     bpf_get_current_pid_tgid() >> 32 :
+			     bpf_get_current_pid_tgid() & 0xffffffff;
 		break;
 	default:
 		return 0;
 	}
 
-	accum_key = bpf_map_lookup_elem(&filter, &filter_key);
-	if (!accum_key)
+	fval = bpf_map_lookup_elem(&filter, &filter_key);
+	if (!fval)
 		return 0;
 
+	accum_key = fval->accum_key;
+	if (fval->exited)
+		bpf_map_delete_elem(&filter, &filter_key);
+
 do_add:
 	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
 	if (!diff_val)
 		return 0;
 
-	accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
+	accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key);
 	if (!accum_val)
 		return 0;
 
@@ -75,4 +93,70 @@ int BPF_PROG(fexit_XXX)
 	return 0;
 }
 
+/* The program is only used for PID or TGID filter types. */
+SEC("tp_btf/task_newtask")
+int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags)
+{
+	__u32 parent_key, child_key;
+	struct bperf_filter_value *parent_fval;
+	struct bperf_filter_value child_fval = { 0 };
+
+	if (!enabled)
+		return 0;
+
+	switch (type) {
+	case BPERF_FILTER_PID:
+		parent_key = bpf_get_current_pid_tgid() & 0xffffffff;
+		child_key = task->pid;
+		break;
+	case BPERF_FILTER_TGID:
+		parent_key = bpf_get_current_pid_tgid() >> 32;
+		child_key = task->tgid;
+		if (child_key == parent_key)
+			return 0;
+		break;
+	default:
+		return 0;
+	}
+
+	/* Check if the current task is one of the target tasks to be counted */
+	parent_fval = bpf_map_lookup_elem(&filter, &parent_key);
+	if (!parent_fval)
+		return 0;
+
+	/* Start counting for the new task by adding it into filter map,
+	 * inherit the accum key of its parent task so that they can be
+	 * counted together.
+	 */
+	child_fval.accum_key = parent_fval->accum_key;
+	child_fval.exited = 0;
+	bpf_map_update_elem(&filter, &child_key, &child_fval, BPF_NOEXIST);
+
+	return 0;
+}
+
+/* The program is only used for PID or TGID filter types. */
+SEC("tp_btf/sched_process_exit")
+int BPF_PROG(on_exittask, struct task_struct *task)
+{
+	__u32 pid;
+	struct bperf_filter_value *fval;
+
+	if (!enabled)
+		return 0;
+
+	/* Stop counting for this task by removing it from filter map.
+	 * For TGID type, if the pid can be found in the map, it means that
+	 * this pid belongs to the leader task. After the task exits, the
+	 * tgid of its child tasks (if any) will be 1, so the pid can be
+	 * safely removed.
+	 */
+	pid = task->pid;
+	fval = bpf_map_lookup_elem(&filter, &pid);
+	if (fval)
+		fval->exited = 1;
+
+	return 0;
+}
+
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h
index 1ce0c2c905c112d68a0548cdaf484e20432cb459..4a4a753980bef8f03c82f9fbe529757866d4f8ca 100644
--- a/tools/perf/util/bpf_skel/bperf_u.h
+++ b/tools/perf/util/bpf_skel/bperf_u.h
@@ -11,4 +11,9 @@ enum bperf_filter_type {
 	BPERF_FILTER_TGID,
 };
 
+struct bperf_filter_value {
+	__u32 accum_key;
+	__u8 exited;
+};
+
 #endif /* __BPERF_STAT_U_H */
diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h
index d582cae8e1051a606d0b5b383584adf77c4df7b0..2ee2cc30340f0f1cc89018e508a6a8e64dbd5b4a 100644
--- a/tools/perf/util/target.h
+++ b/tools/perf/util/target.h
@@ -17,6 +17,7 @@ struct target {
 	bool default_per_cpu;
 	bool per_thread;
 	bool use_bpf;
+	bool inherit;
 	int initial_delay;
 	const char *attr_map;
 };
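One idiom recurs throughout the BPF side of this patch: bpf_get_current_pid_tgid() packs the tgid (the user-visible process id) into the upper 32 bits and the pid (the thread id) into the lower 32. That is why BPERF_FILTER_TGID keys the filter map on ">> 32" when inherit is on, but falls back to the low half, the individual thread, when it is off, so freshly created threads never match. A reference sketch of the unpacking (helper names are ours, not the kernel's):

#include <linux/types.h>

/* bpf_get_current_pid_tgid() returns (tgid << 32) | pid */
static inline __u32 pidtgid_to_tgid(__u64 pid_tgid)
{
	return pid_tgid >> 32;		/* process id in user-space terms */
}

static inline __u32 pidtgid_to_pid(__u64 pid_tgid)
{
	return pid_tgid & 0xffffffff;	/* thread id */
}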