diff --git a/README.en.md b/README.en.md
index efa7d54dcd2827eb86bd9379f220485a30bc27df..503138f40891afa6ed46b2bf9e2b6097ea9a926a 100644
--- a/README.en.md
+++ b/README.en.md
@@ -262,7 +262,7 @@ func main() {
 	}
 
 	for _, o := range dataVo.GoData {
-		fmt.Printf("event: %v count: %v", o.Evt, o.Count)
+		fmt.Printf("event: %v count: %v\n", o.Evt, o.Count)
 	}
 	kperf.PmuDataFree(dataVo)
 	kperf.PmuClose(fd)
diff --git a/README.md b/README.md
index d511787ac9d9b59ff88b05579dffff89e359561d..ee59dc44420dffe8ec31da173d09919ef0fc8366 100644
--- a/README.md
+++ b/README.md
@@ -237,7 +237,7 @@ func main() {
 	}
 
 	for _, o := range dataVo.GoData {
-		fmt.Printf("event: %v count: %v", o.Evt, o.Count)
+		fmt.Printf("event: %v count: %v\n", o.Evt, o.Count)
 	}
 	kperf.PmuDataFree(dataVo)
 	kperf.PmuClose(fd)
diff --git a/docs/Details_Usage.md b/docs/Details_Usage.md
index 70c50539446da88fdab01ad095469d65ac13c3e1..815f4d86f993ef635022ab2bd9e58e608e166ea4 100644
--- a/docs/Details_Usage.md
+++ b/docs/Details_Usage.md
@@ -600,162 +600,230 @@ pmu_attr = kperf.PmuAttr(evtList=evtList, includeNewFork=True)
 Note that this feature targets Counting mode, because Sampling and SPE Sampling already collect child-thread data by themselves.
 
 ### Collecting DDRC bandwidth
-DDRC memory bandwidth can be computed from uncore events; the formula differs between hardware platforms.
-The formulas for Kunpeng chips can be found in the openEuler kernel at tools/perf/pmu-events/arch/arm64/hisilicon/hip09/sys/uncore-ddrc.json:
-```json
-  {
-    "MetricExpr": "flux_wr * 32 / duration_time",
-    "BriefDescription": "Average bandwidth of DDRC memory write(Byte/s)",
-    "Compat": "0x00000030",
-    "MetricGroup": "DDRC",
-    "MetricName": "ddrc_bw_write",
-    "Unit": "hisi_sccl,ddrc"
-  },
-  {
-    "MetricExpr": "flux_rd * 32 / duration_time",
-    "BriefDescription": "Average bandwidth of DDRC memory read(Byte/s)",
-    "Compat": "0x00000030",
-    "MetricGroup": "DDRC",
-    "MetricName": "ddrc_bw_read",
-    "Unit": "hisi_sccl,ddrc"
-  },
-```
-
-Following these formulas, collect the flux_wr and flux_rd events and compute the bandwidth:
-```c++
-// c++ example
-vector<string> evts = {
-    "hisi_sccl1_ddrc/flux_rd/",
-    "hisi_sccl3_ddrc/flux_rd/",
-    "hisi_sccl5_ddrc/flux_rd/",
-    "hisi_sccl7_ddrc/flux_rd/",
-    "hisi_sccl1_ddrc/flux_wr/",
-    "hisi_sccl3_ddrc/flux_wr/",
-    "hisi_sccl5_ddrc/flux_wr/",
-    "hisi_sccl7_ddrc/flux_wr/"
-}; // Collect flux_rd and flux_wr of the hisi_scclX_ddrc devices. The device names vary with the hardware and can be looked up under /sys/devices/.
-
-PmuAttr attr = {0};
-attr.evtList = evts.data();
-attr.numEvt = evts.size();
-
-int pd = PmuOpen(COUNTING, &attr);
-if (pd == -1) {
-    cout << Perror() << "\n";
-    return;
-}
-
-PmuEnable(pd);
-for (int i = 0; i < 60; ++i) {
-    sleep(1);
-    PmuData *data = nullptr;
-    int len = PmuRead(pd, &data);
-    // There are 8 uncore events, so data has 8 entries.
-    // The first 4 are the read bandwidth of the 4 numa nodes, the last 4 are the write bandwidth.
-    for (int j = 0; j < 4; ++j) {
-        printf("read bandwidth: %f M/s\n", (float)data[j].count * 32 / 1024 / 1024);
-    }
-    for (int j = 4; j < 8; ++j) {
-        printf("write bandwidth: %f M/s\n", (float)data[j].count * 32 / 1024 / 1024);
-    }
-    PmuDataFree(data);
-}
-PmuDisable(pd);
-PmuClose(pd);
-```
-
-```python
-# python example
-import kperf
-import time
-
-evtList = ["hisi_sccl1_ddrc/flux_rd/", "hisi_sccl3_ddrc/flux_rd/", "hisi_sccl5_ddrc/flux_rd/", "hisi_sccl7_ddrc/flux_rd/",
-           "hisi_sccl1_ddrc/flux_wr/", "hisi_sccl3_ddrc/flux_wr/", "hisi_sccl5_ddrc/flux_wr/", "hisi_sccl7_ddrc/flux_wr/"]
-pmu_attr = kperf.PmuAttr(evtList=evtList)
-pd = kperf.open(kperf.PmuTaskType.COUNTING, pmu_attr)
-kperf.enable(pd)
-for i in range(60):
-    time.sleep(1)
-    data = kperf.read(pd)
-    j = 0
-    for o in data.iter:
-        bandwidth = o.count * 32 / 1024 / 1024
-        if j < 4:
-            print(f"read bandwidth: {bandwidth} M/s\n")
-        if j >= 4 and j < 8:
-            print(f"write bandwidth: {bandwidth} M/s\n")
-        j += 1
-kperf.disable(pd)
-kperf.close(pd)
-```
-
-```go
-// go example
-import "libkperf/kperf"
-import "time"
-import "fmt"
-
-func main() {
-    evtList := []string{"hisi_sccl1_ddrc/flux_rd/",
-        "hisi_sccl3_ddrc/flux_rd/",
-        "hisi_sccl5_ddrc/flux_rd/",
-        "hisi_sccl7_ddrc/flux_rd/",
-        "hisi_sccl1_ddrc/flux_wr/",
-        "hisi_sccl3_ddrc/flux_wr/",
-        "hisi_sccl5_ddrc/flux_wr/",
-        "hisi_sccl7_ddrc/flux_wr/"}
-    attr := kperf.PmuAttr{EvtList: evtList}
-    pd, err := kperf.PmuOpen(kperf.COUNT, attr)
-    if err != nil {
-        fmt.Printf("kperf pmuopen sample failed, expect err is nil, but is %v\n", err)
-        return
-    }
-    kperf.PmuEnable(pd)
-
-    for i := 0; i < 60; i++ {
-        time.Sleep(time.Second)
-        dataVo, err := kperf.PmuRead(pd)
-        if err != nil {
-            fmt.Printf("kperf pmuread failed, expect err is nil, but is %v\n", err)
-        }
-
-        j := 0
-        for _, o := range dataVo.GoData {
-            bandwidth := o.Count * 32 / 1024 / 1024
-            if j < 4 {
-                fmt.Printf("read bandwidth: %v M/s\n", bandwidth)
-            }
-            if j >= 4 && j < 8 {
-                fmt.Printf("write bandwidth: %v M/s\n", bandwidth)
-            }
-            j += 1
-        }
-    }
-    kperf.PmuDisable(pd)
-    kperf.PmuClose(pd)
-}
-```
+Kunpeng provides a DDRC PMU device for collecting DDR performance data such as bandwidth. libkperf offers an API that reports the DDR bandwidth of each numa node.
+
+Reference code:
+```c++
+// c++ example
+PmuDeviceAttr devAttr[2];
+// DDR read bandwidth
+devAttr[0].metric = PMU_DDR_READ_BW;
+// DDR write bandwidth
+devAttr[1].metric = PMU_DDR_WRITE_BW;
+// Initialize the collection task.
+int pd = PmuDeviceOpen(devAttr, 2);
+// Start collecting.
+PmuEnable(pd);
+sleep(1);
+// Read the raw records.
+PmuData *oriData = nullptr;
+int oriLen = PmuRead(pd, &oriData);
+PmuDeviceData *devData = nullptr;
+auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 2, &devData);
+// On a server with 4 numa nodes, devData has 8 entries: the first 4 are read bandwidth, the last 4 are write bandwidth.
+for (int i = 0; i < 4; ++i) {
+    // numaId is the numa node the record belongs to.
+    // count is the total number of bytes read/written since the last read, in Byte;
+    // divide it by the collection interval to get the bandwidth (here the interval is 1 second).
+    cout << "read bandwidth(" << devData[i].numaId << "): " << devData[i].count/1024/1024 << "M/s\n";
+}
+for (int i = 4; i < 8; ++i) {
+    cout << "write bandwidth(" << devData[i].numaId << "): " << devData[i].count/1024/1024 << "M/s\n";
+}
+DevDataFree(devData);
+PmuDataFree(oriData);
+PmuDisable(pd);
+```
+
+```python
+# python example
+dev_attr = [
+    kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_READ_BW),
+    kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW)
+]
+pd = kperf.device_open(dev_attr)
+kperf.enable(pd)
+time.sleep(1)
+kperf.disable(pd)
+ori_data = kperf.read(pd)
+dev_data = kperf.get_device_metric(ori_data, dev_attr)
+for data in dev_data.iter:
+    if data.metric == kperf.PmuDeviceMetric.PMU_DDR_READ_BW:
+        print(f"read bandwidth({data.numaId}): {data.count/1024/1024} M/s")
+    if data.metric == kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW:
+        print(f"write bandwidth({data.numaId}): {data.count/1024/1024} M/s")
+```
+
+```go
+// go example
+deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_DDR_READ_BW}, kperf.PmuDeviceAttr{Metric: kperf.PMU_DDR_WRITE_BW}}
+fd, _ := kperf.PmuDeviceOpen(deviceAttrs)
+kperf.PmuEnable(fd)
+time.Sleep(1 * time.Second)
+kperf.PmuDisable(fd)
+dataVo, _ := kperf.PmuRead(fd)
+deviceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs)
+for _, v := range deviceDataVo.GoDeviceData {
+	if v.Metric == kperf.PMU_DDR_READ_BW {
+		fmt.Printf("read bandwidth(%v): %v M/s\n", v.NumaId, v.Count/1024/1024)
+	}
+	if v.Metric == kperf.PMU_DDR_WRITE_BW {
+		fmt.Printf("write bandwidth(%v): %v M/s\n", v.NumaId, v.Count/1024/1024)
+	}
+}
+kperf.DevDataFree(deviceDataVo)
+kperf.PmuDataFree(dataVo)
+kperf.PmuClose(fd)
+```
+
+Running the code above prints something like:
+```
+read bandwidth(0): 17.32 M/s
+read bandwidth(1): 5.43 M/s
+read bandwidth(2): 2.83 M/s
+read bandwidth(3): 4.09 M/s
+write bandwidth(0): 4.35 M/s
+write bandwidth(1): 2.29 M/s
+write bandwidth(2): 0.84 M/s
+write bandwidth(3): 0.97 M/s
+```
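+
+The `count`/`Count` fields above are byte deltas accumulated since the previous read, so for intervals other than 1 second they must also be divided by the elapsed time. A minimal Go sketch of that conversion (the helper name `bandwidthMBps` is ours, not part of libkperf):
+```go
+import "time"
+
+// bandwidthMBps converts a byte delta returned by PmuGetDevMetric into MB/s,
+// given the interval between enabling the task and reading it.
+func bandwidthMBps(count float64, interval time.Duration) float64 {
+	return count / interval.Seconds() / 1024 / 1024
+}
+```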
+
+### Collecting L3 cache latency
+libkperf can collect the average latency of the L3 cache, which helps analyze the bottlenecks of memory-bound applications.
+Data is collected at cluster granularity. Each cluster contains 4 cpu cores (8 if hyper-threading is enabled); PmuGetClusterCore maps a cluster id to its core ids.
+
+Reference code:
+```c++
+// c++ example
+PmuDeviceAttr devAttr[1];
+// Average L3 latency.
+devAttr[0].metric = PMU_L3_LAT;
+// Initialize the collection task.
+int pd = PmuDeviceOpen(devAttr, 1);
+// Start collecting.
+PmuEnable(pd);
+sleep(1);
+PmuData *oriData = nullptr;
+int oriLen = PmuRead(pd, &oriData);
+PmuDeviceData *devData = nullptr;
+auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 1, &devData);
+// devData has one entry per cluster.
+for (int i = 0; i < len; ++i) {
+    // clusterId is the cluster the record belongs to.
+    cout << "L3 latency(" << devData[i].clusterId << "): " << devData[i].count << " cycles\n";
+}
+DevDataFree(devData);
+PmuDataFree(oriData);
+PmuDisable(pd);
+```
+
+```python
+# python example
+dev_attr = [
+    kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_L3_LAT)
+]
+pd = kperf.device_open(dev_attr)
+kperf.enable(pd)
+time.sleep(1)
+kperf.disable(pd)
+ori_data = kperf.read(pd)
+dev_data = kperf.get_device_metric(ori_data, dev_attr)
+for data in dev_data.iter:
+    print(f"L3 latency({data.clusterId}): {data.count} cycles")
+```
+
+```go
+// go example
+deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_L3_LAT}}
+fd, _ := kperf.PmuDeviceOpen(deviceAttrs)
+kperf.PmuEnable(fd)
+time.Sleep(1 * time.Second)
+kperf.PmuDisable(fd)
+dataVo, _ := kperf.PmuRead(fd)
+deviceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs)
+for _, v := range deviceDataVo.GoDeviceData {
+	fmt.Printf("L3 latency(%v): %v cycles\n", v.ClusterId, v.Count)
+}
+kperf.DevDataFree(deviceDataVo)
+kperf.PmuDataFree(dataVo)
+kperf.PmuClose(fd)
+```
+
+Running the code above prints something like:
+```
+L3 latency(0): 101 cycles
+L3 latency(1): 334.6 cycles
+L3 latency(2): 267.8 cycles
+L3 latency(3): 198.4 cycles
+...
+```
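+
+Each cluster id in the output can be mapped back to its cpu cores with PmuGetClusterCore, whose Go signature is documented in Go_API.md. A minimal sketch, with error handling elided:
+```go
+// Print the cpu cores behind cluster 0.
+coreList, err := kperf.PmuGetClusterCore(uint(0))
+if err == nil {
+	fmt.Printf("cluster 0 -> cores %v\n", coreList)
+}
+```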
+
+### Collecting PCIE bandwidth
+libkperf can collect PCIE bandwidth, both read and write in the tx and rx directions, which is useful for monitoring the bandwidth of peripheral devices (nvme, gpu, and so on).
+Not every PCIE device can be measured: the Kunpeng PMU devices only cover part of them. PmuDeviceBdfList returns the PCIE devices or root ports that are collectable on the current machine.
+
+Reference code:
+```c++
+// c++ example
+PmuDeviceAttr devAttr[1];
+// Collect the rx read bandwidth of a PCIE device.
+devAttr[0].metric = PMU_PCIE_RX_MRD_BW;
+// Set the bdf number of the PCIE device.
+devAttr[0].bdf = "16:04.0";
+// Initialize the collection task.
+int pd = PmuDeviceOpen(devAttr, 1);
+// Start collecting.
+PmuEnable(pd);
+sleep(1);
+PmuData *oriData = nullptr;
+int oriLen = PmuRead(pd, &oriData);
+PmuDeviceData *devData = nullptr;
+auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 1, &devData);
+// devData has one entry per pcie device.
+for (int i = 0; i < len; ++i) {
+    // bdf is the pcie device the record belongs to.
+    cout << "rx mrd bandwidth(" << devData[i].bdf << "): " << devData[i].count << " Bytes/ns\n";
+}
+DevDataFree(devData);
+PmuDataFree(oriData);
+PmuDisable(pd);
+```
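+
+Before opening the task, the devices that can actually be measured can be enumerated with PmuDeviceBdfList (documented in Go_API.md). A minimal Go sketch, with error handling elided:
+```go
+// List the bdf numbers of collectable PCIE devices,
+// then pick one of them as PmuDeviceAttr.Bdf.
+bdfList, err := kperf.PmuDeviceBdfList(kperf.PMU_BDF_TYPE_PCIE)
+if err == nil {
+	fmt.Printf("collectable PCIE bdf list: %v\n", bdfList)
+}
+```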
+
-### Blocked Sample sampling
+### Mixed IO and compute hotspot sampling (Blocked Sample)
 Blocked Sample is a newly added sampling mode that collects both on-cpu and off-cpu data of a process. It is enabled through the blockedSample field: cycles and context-switches events are collected together, and the off-cpu period is derived from the context switches.
 
-Notes:
-
-1. Only SAMPLING mode is supported.
-
-2. Only process-level analysis is supported; system-wide analysis is not.
-
-Usage example:
-```bash
-cd example
-# Run the C++ example and analyze hotspots
-bash run.sh all
-# Run the python example and analyze hotspots
-bash run.sh all python=true
-```
-
-### Enhanced uncore event collection
-1. Configurable uncore events, for example:
-```bash
-smmuv3_pmcg_100020/transaction,filter_enable=1,filter_stream_id=0x7d/
-```
-2. Collection and query of L3, DDR, SMMU and PCIE performance data:
-- L3 bandwidth, hits and misses per core, on 920 and the high-performance 920
-- L3 latency per numa, on the high-performance 920
-- DDR read/write bandwidth per numa, on 920 and the high-performance 920
-- SMMU address translation count for a given bdf, on 920 and the high-performance 920
-- PCIE rx/tx read and write bandwidth for a given bdf, on the high-performance 920
-
-Code example:
-```C++
-// C++ example
-PmuDeviceAttr devAttr = {};
-devAttr.metric = PMU_L3_TRAFFIC;
-int pd = PmuDeviceOpen(&devAttr, 1);
-
-PmuEnable(pd);
-sleep(1);
-PmuDisable(pd);
-
-PmuData* oriData = nullptr;
-int oriLen = PmuRead(pd, &oriData);
-
-PmuDeviceData *devData = nullptr;
-auto len = PmuGetDevMetric(oriData, oriLen, &devAttr, 1, &devData);
-```
-
-```python
-# python example
-dev_attr = [
-    kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_L3_TRAFFIC)
-]
-pd = kperf.device_open(dev_attr)
-
-kperf.enable(pd)
-time.sleep(1)
-kperf.disable(pd)
-ori_data = kperf.read(pd)
-
-dev_data = kperf.get_device_metric(ori_data, dev_attr)
-```
+See example/pmu_hotspot.cpp for detailed usage.
+Build command:
+```
+g++ -g pmu_hotspot.cpp -o pmu_hotspot -I /path/to/libkperf/include -L /path/to/libkperf/lib -lkperf -lsym
+```
+Consider this example:
+```
+thread1:
+  busy_io
+    compute
+    while
+      write
+      fsync
+thread2:
+  cpu_compute
+    while
+      compute
+```
+It contains both computation (compute) and IO (write, fsync). Sampling it with perf captures only the on-cpu side:
+|overhead|Shared Object|Symbol|
+|--------|-------------|------|
+|99.94%|test_io|compute|
+|0.03%|libpthread-2.17.so|__pthread_enable_asynccancel|
+|0.00%|test_io|busy_io|
+
+Collect with pmu_hotspot instead:
+```
+pmu_hotspot 5 1 1
+```
+
+Output:
+|overhead|Shared Object|Symbol|
+|--------|-------------|------|
+|54.74%|libpthread-2.17.so|fsync|
+|27.18%|test_io|compute|
+fsync is now captured, showing that IO accounts for a larger share of this process than computation.
+
+Limitations:
+
+1. Only SAMPLING mode is supported.
+
+2. Only process-level analysis is supported; system-wide analysis is not.
\ No newline at end of file
diff --git a/docs/Go_API.md b/docs/Go_API.md
index 4db378a4affd999e87cb5c3e43fed7375bbbd027..fa1ba8f8f9dba44ddcd052317675bcaaa663cb52 100644
--- a/docs/Go_API.md
+++ b/docs/Go_API.md
@@ -127,7 +127,7 @@ func PmuRead(fd int) (PmuDataVo, error)
   * Period uint64 sampling period
   * Count uint64 event count
   * CountPercent float64 count ratio, enabled time / running time
-  * CpuTopo CpuTopolopy
+  * CpuTopo CpuTopology
     * CoreId system core ID
     * NumaId numa ID
     * SocketId socket ID
@@ -267,6 +267,7 @@ func PmuTraceRead(taskId int) (PmuTraceDataVo, error)
 
   * FuncName string syscall function name
   * ElapsedTime float64 elapsed time
+  * StartTs start timestamp
   * Pid int process id
   * Tid int thread id
   * Cpu int cpu number
@@ -280,7 +281,7 @@
 if err != nil {
 }
 
 for _, v := range traceList.GoTraceData {
-	fmt.Printf("funcName: %v, elapsedTime: %v ms pid: %v tid: %v, cpu: %v comm: %v\n", v.FuncName, v.ElapsedTime, v.Pid, v.Tid, v.Cpu, v.Comm)
+	fmt.Printf("funcName: %v, elapsedTime: %v ms startTs: %v pid: %v tid: %v, cpu: %v comm: %v\n", v.FuncName, v.ElapsedTime, v.StartTs, v.Pid, v.Tid, v.Cpu, v.Comm)
 }
 ```
@@ -307,3 +308,147 @@ func main() {
 	}
 }
 ```
+
+### kperf.PmuDeviceBdfList
+
+func PmuDeviceBdfList(bdfType C.enum_PmuBdfType) ([]string, error) queries all bdf numbers of the given type from the system
+* bdfType C.enum_PmuBdfType
+  * PMU_BDF_TYPE_PCIE bdf numbers of PCIE devices
+  * PMU_BDF_TYPE_SMMU bdf numbers of SMMU devices
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	pcieBdfList, err := kperf.PmuDeviceBdfList(kperf.PMU_BDF_TYPE_PCIE)
+	if err != nil {
+		fmt.Printf("kperf GetDeviceBdfList failed, expect err is nil, but is %v\n", err)
+	}
+	for _, v := range pcieBdfList {
+		fmt.Printf("bdf is %v\n", v)
+	}
+}
+```
+### kperf.PmuDeviceOpen
+
+func PmuDeviceOpen(attr []PmuDeviceAttr) (int, error) initializes collection of uncore-event metrics
+
+* type PmuDeviceAttr struct:
+  * Metric: the metric to collect
+    * PMU_DDR_READ_BW ddrc read bandwidth per numa node, unit: Bytes
+    * PMU_DDR_WRITE_BW ddrc write bandwidth per numa node, unit: Bytes
+    * PMU_L3_TRAFFIC L3 bytes accessed per core, unit: Bytes
+    * PMU_L3_MISS L3 misses per core, unit: count
+    * PMU_L3_REF total L3 accesses per core, unit: count
+    * PMU_L3_LAT total L3 latency per cluster, unit: cycles
+    * PMU_PCIE_RX_MRD_BW read bandwidth in the rx direction of a pcie device, unit: Bytes/ns
+    * PMU_PCIE_RX_MWR_BW write bandwidth in the rx direction of a pcie device, unit: Bytes/ns
+    * PMU_PCIE_TX_MRD_BW read bandwidth in the tx direction of a pcie device, unit: Bytes/ns
+    * PMU_PCIE_TX_MWR_BW write bandwidth in the tx direction of a pcie device, unit: Bytes/ns
+    * PMU_SMMU_TRAN address translation count of the given smmu device, unit: count
+  * Bdf: the bdf number of the device to collect; only meaningful for the pcie and smmu metrics
+* The return values are an int and an error: pd > 0 means initialization succeeded, pd == -1 means it failed and the returned error carries the details. Below is an example of kperf.PmuDeviceOpen:
+
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_L3_LAT}}
+	fd, err := kperf.PmuDeviceOpen(deviceAttrs)
+	if err != nil {
+		fmt.Printf("kperf PmuDeviceOpen failed, expect err is nil, but is %v\n", err)
+	}
+}
+```
+
+### kperf.PmuGetDevMetric
+
+func PmuGetDevMetric(dataVo PmuDataVo, deviceAttr []PmuDeviceAttr) (PmuDeviceDataVo, error) aggregates the raw data returned by PmuRead according to the metrics given in deviceAttr; the result is a list of PmuDeviceData
+
+* type PmuDataVo struct: the raw data returned by PmuRead
+* []PmuDeviceAttr: the metrics to aggregate by
+* type PmuDeviceDataVo struct:
+  * GoDeviceData []PmuDeviceData
+* type PmuDeviceData struct:
+  * Metric C.enum_PmuDeviceMetric the collected metric
+  * Count float64 the metric value
+  * Mode C.enum_PmuMetricMode how the metric is aggregated: per core, per numa or per bdf
+  * CoreId uint32 core number of the record
+  * NumaId uint32 numa number of the record
+  * ClusterId uint32 cluster ID
+  * Bdf string bdf number of the record
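+
+Below is a minimal end-to-end sketch combining PmuRead and PmuGetDevMetric, mirroring the DDR example in Details_Usage.md (the metric choice and the 1-second interval are illustrative):
+
+```go
+import "libkperf/kperf"
+import "time"
+import "fmt"
+
+func main() {
+	deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_DDR_READ_BW}}
+	fd, err := kperf.PmuDeviceOpen(deviceAttrs)
+	if err != nil {
+		fmt.Printf("kperf PmuDeviceOpen failed, expect err is nil, but is %v\n", err)
+		return
+	}
+	kperf.PmuEnable(fd)
+	time.Sleep(1 * time.Second)
+	kperf.PmuDisable(fd)
+	dataVo, _ := kperf.PmuRead(fd)
+	// Aggregate the raw records into one entry per numa node.
+	deviceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs)
+	for _, v := range deviceDataVo.GoDeviceData {
+		fmt.Printf("metric: %v numa: %v count: %v\n", v.Metric, v.NumaId, v.Count)
+	}
+	kperf.DevDataFree(deviceDataVo)
+	kperf.PmuDataFree(dataVo)
+	kperf.PmuClose(fd)
+}
+```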
+
+### kperf.DevDataFree
+
+func DevDataFree(devVo PmuDeviceDataVo) releases the pointer data behind PmuDeviceData
+
+### kperf.PmuGetClusterCore
+
+func PmuGetClusterCore(clusterId uint) ([]uint, error) queries the core list of the given clusterId
+
+* clusterId the cluster id of the CPU
+* Return value: the core list of the given clusterId; on error the list is empty and error is non-nil
+
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	clusterId := uint(1)
+	coreList, err := kperf.PmuGetClusterCore(clusterId)
+	if err != nil {
+		fmt.Printf("kperf PmuGetClusterCore failed, expect err is nil, but is %v\n", err)
+		return
+	}
+	for _, v := range coreList {
+		fmt.Printf("coreId has:%v\n", v)
+	}
+}
+```
+
+### kperf.PmuGetNumaCore
+
+func PmuGetNumaCore(nodeId uint) ([]uint, error) queries the core list of the given numa node
+
+* nodeId the numa node id
+* Return value: the cpu core list of the given numa node; on error the list is empty and error is non-nil
+
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	nodeId := uint(0)
+	coreList, err := kperf.PmuGetNumaCore(nodeId)
+	if err != nil {
+		fmt.Printf("kperf PmuGetNumaCore failed, expect err is nil, but is %v\n", err)
+		return
+	}
+	for _, v := range coreList {
+		fmt.Printf("coreId has:%v\n", v)
+	}
+}
+```
+
+
+### kperf.PmuGetCpuFreq
+func PmuGetCpuFreq(core uint) (int64, error) queries the real-time frequency of the given cpu core
+
+* core cpu coreId
+* Return value: an int64, the real-time frequency of the given cpu core; on error the value is -1 and error is non-nil
+
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	coreId := uint(0)
+	freq, err := kperf.PmuGetCpuFreq(coreId)
+	if err != nil {
+		fmt.Printf("kperf PmuGetCpuFreq failed, expect err is nil, but is %v\n", err)
+		return
+	}
+	fmt.Printf("coreId %v freq is %v\n", coreId, freq)
+}
+```
\ No newline at end of file
diff --git a/go/src/libkperf/kperf/kperf.go b/go/src/libkperf/kperf/kperf.go
index cf10f715b4fa6eef5afd9458377d6b0737e8bb63..2b5958ca39adc2fd88a5f2a6e7eb43c12fd4a631 100644
--- a/go/src/libkperf/kperf/kperf.go
+++ b/go/src/libkperf/kperf/kperf.go
@@ -308,13 +308,13 @@ type PmuAttr struct {
 	CallStack bool                 // This indicates whether to collect whole callchains or only top frame
 	DataFilter C.enum_SpeFilter    // Spe Data Filter.Refer to comments of SpeFilter
 	EvFilter C.enum_SpeEventFilter // Spe Event filter.Refer to comments of SpeEventFilter
-	MinLatency uint64              // Collect only smaples with latency or higher
-	IncludeNewFork bool            // enable it you can get the new child thread count, only in couting mode
-	BranchSampleFilter uint64      // if the filering mode is set, branch_sample_stack data is collected in sampling mode
-	BlockedSample bool             // This indicates whether the blocked sample mode is enabled. In this mode, both on Cpu and off Cpu data is collectd
+	MinLatency uint64              // Collect only samples with latency or higher
+	IncludeNewFork bool            // enable it you can get the new child thread count, only in counting mode
+	BranchSampleFilter uint64      // if the filter mode is set, branch_sample_stack data is collected in sampling mode
+	BlockedSample bool             // This indicates whether the blocked sample mode is enabled. In this mode, both on Cpu and off Cpu data is collected
 }
 
-type CpuTopolopy struct {
+type CpuTopology struct {
 	CoreId int   // cpu core id
 	NumaId int   // numa id
 	SocketId int // socket id
@@ -338,12 +338,12 @@ type PmuData struct {
 	Ts uint64   // time stamp. unit: ns
 	Pid int     // process id
 	Tid int     // thread id
-	Cpu int      // cpu id
+	Cpu int     // cpu id
 	Comm string // process command
 	Period uint64 // sample period
 	Count uint64  // event count. Only available for counting
 	CountPercent float64 // event count Percent. when count = 0, countPercent = -1; Only available for counting
-	CpuTopo CpuTopolopy // cpu topolopy
+	CpuTopo CpuTopology // cpu topology
 	Symbols []sym.Symbol // symbol list
 	BranchRecords []BranchSampleRecord // branch record list
 	SpeExt SpeDataExt // SPE data
@@ -353,7 +353,7 @@ type PmuData struct {
 
 type PmuDataVo struct {
 	GoData []PmuData // PmuData list
-	cData *C.struct_PmuData // Pointer to PmuData in inferface C
+	cData *C.struct_PmuData // Pointer to PmuData in interface C
 	fd int // fd
 }
@@ -374,10 +374,11 @@ type PmuTraceAttr struct {
 // PmuTraceData info
 type PmuTraceData struct {
 	FuncName string // function name
+	StartTs int64   // start timestamp. unit: us
 	ElapsedTime float64 // elapsed time
 	Pid int     // process id
 	Tid int     // thread id
-	Cpu int      // cpu id
+	Cpu int     // cpu id
 	Comm string // process command
 }
@@ -390,7 +391,7 @@ type PmuTraceDataVo struct {
 
 type PmuDeviceAttr struct {
 	Metric C.enum_PmuDeviceMetric
-	// Used for PMU_PCIE_XXX and PMU_SMMU_XXX to collect a specifi pcie device.
+	// Used for PMU_PCIE_XXX and PMU_SMMU_XXX to collect a specific pcie device.
 	// The string of bdf is something like '7a:01.0'.
 	Bdf string
 }
@@ -403,7 +404,7 @@ type PmuDeviceData struct {
 	Mode C.enum_PmuMetricMode // Field of union depends on the above.
 	CoreId uint32    // for percore metric
 	NumaId uint32    // for pernuma metric
-	ClusterId uint32 // for percluster emtric
+	ClusterId uint32 // for percluster metric
 	Bdf string       // for perpcie metric
 }
@@ -548,7 +549,7 @@ func PmuEventList(eventType C.enum_PmuEventType) []string {
 // Enable counting or sampling of task <fd>.
 // On success, nil is returned.
 // On error, error is returned.
-// param pd task id
+// param fd task id
 // return error
 func PmuEnable(fd int) error {
 	rs := C.PmuEnable(C.int(fd))
@@ -561,7 +562,7 @@
 // Disable counting or sampling of task <fd>.
 // On success, nil is returned.
 // On error, error is returned.
-// param pd task id
+// param fd task id
 // return err
 func PmuDisable(fd int) error {
 	rs := C.PmuDisable(C.int(fd))
@@ -613,7 +614,7 @@ func PmuDataFree(data PmuDataVo) {
 
 // Close task with id <fd>
 // After PmuClose is called, all pmu data related to the task become invalid
-// param pd task id
+// param fd task id
 func PmuClose(fd int) {
 	if fd <= 0 {
 		return
@@ -627,7 +628,7 @@
 
 // stop a sampling task in asynchronous mode
-// param pd pmu descriptor.
+// param fd pmu descriptor.
 func PmuStop(fd int) {
 	if fd <= 0 {
 		return
@@ -641,7 +642,7 @@
 // That is to say, for COUNTING, counts of all pmu event are reset to zero in PmuRead
 // For SAMPLING and SPE_SAMPLING, samples collected are started from the last PmuEnable or PmuRead
 // On success, PmuDataVo is returned
-// param pd task id
+// param fd task id
 // return PmuDataVo and error
 func PmuRead(fd int) (PmuDataVo, error) {
 	pmuDataVo := PmuDataVo{}
@@ -762,7 +763,7 @@ func PmuTraceOpen(traceType C.enum_PmuTraceType, traceAttr PmuTraceAttr) (int, e
 // Enable trace collection of task <taskId>
 // On success, nil is returned.
 // On error, -1 is returned.
-// param pd trace collect task id
+// param taskId trace collect task id
 // return error code
 func PmuTraceEnable(taskId int) error {
 	rs := C.PmuTraceEnable(C.int(taskId))
@@ -775,7 +776,7 @@
 // Disable trace collection of task <taskId>
 // On success, nil is returned
 // On error, error is returned
-// param pd trace collect task id
+// param taskId trace collect task id
 // return error code
 func PmuTraceDisable(taskId int) error {
 	rs := C.PmuTraceDisable(C.int(taskId))
@@ -788,7 +789,7 @@
 // Collect data.
 // Pmu trace data are collected starting from the last PmuTraceEnable or PmuTraceRead
 // On success, PmuTraceDataVo is returned
-// param pd trace collect task id
+// param taskId trace collect task id
 // param PmuTraceDataVo pmu trace data
 // return PmuTraceDataVo and error
 func PmuTraceRead(taskId int) (PmuTraceDataVo, error) {
@@ -812,7 +813,7 @@ func PmuTraceRead(taskId int) (PmuTraceDataVo, error) {
 	cDataList := *(*[]C.struct_PmuTraceData)(unsafe.Pointer(&slice))
 	goTraceData := make([]PmuTraceData, int(traceLen))
 	for i, v := range cDataList {
-		goTraceData[i] = PmuTraceData{FuncName:C.GoString(v.funcs), ElapsedTime:float64(v.elapsedTime), Pid:int(v.pid), Tid: int(v.tid), Cpu: int(v.cpu), Comm: C.GoString(v.comm)}
+		goTraceData[i] = PmuTraceData{FuncName:C.GoString(v.funcs), StartTs: int64(v.startTs), ElapsedTime:float64(v.elapsedTime), Pid:int(v.pid), Tid: int(v.tid), Cpu: int(v.cpu), Comm: C.GoString(v.comm)}
 	}
 	res.GoTraceData = goTraceData
 	res.cTraceData = cTraceData
@@ -821,7 +822,7 @@
 
 // Close task with id <taskId>.
 // After PmuTraceClose is called, all pmu trace data related to the task become invalid
-// param collect task id
+// param taskId task id
 func PmuTraceClose(taskId int) {
 	C.PmuTraceClose(C.int(taskId))
 }
@@ -926,7 +927,11 @@ func PmuDeviceOpen(attr []PmuDeviceAttr) (int, error) {
 	cAttr := make([]C.struct_PmuDeviceAttr, len(attr))
 	for i, v := range attr {
 		cAttr[i].metric = v.Metric
-		cAttr[i].bdf = C.CString(v.Bdf)
+		// Only allocate a C string when a bdf was given; pass nil for metrics that do not need one.
+		if len(v.Bdf) > 0 {
+			cAttr[i].bdf = C.CString(v.Bdf)
+		} else {
+			cAttr[i].bdf = nil
+		}
 	}
 	deviceTaskId := C.PmuDeviceOpen(&cAttr[0], C.uint(len(attr)))
 	if int(deviceTaskId) == -1 {
@@ -947,7 +952,11 @@ func PmuGetDevMetric(dataVo PmuDataVo, deviceAttr []PmuDeviceAttr) (PmuDeviceDat
 	cAttr := make([]C.struct_PmuDeviceAttr, len(deviceAttr))
 	for i, v := range deviceAttr {
 		cAttr[i].metric = v.Metric
-		cAttr[i].bdf = C.CString(v.Bdf)
+		if len(v.Bdf) > 0 {
+			cAttr[i].bdf = C.CString(v.Bdf)
+		} else {
+			cAttr[i].bdf = nil
+		}
 	}
 	metricLen := C.int(0)
 	metricData := C.IPmuGetMetric(dataVo.cData, C.uint(len(dataVo.GoData)), &cAttr[0], C.uint(len(deviceAttr)), &metricLen)
@@ -1071,7 +1080,7 @@ func transferCPmuDataToGoData(cPmuData *C.struct_PmuData, dataLen int, fd int) [
 		goDatas[i].CountPercent = float64(dataObj.countPercent)
 		goDatas[i].Cpu = int(dataObj.cpu)
 		if dataObj.cpuTopo != nil {
-			goDatas[i].CpuTopo = CpuTopolopy{CoreId: int(dataObj.cpuTopo.coreId), NumaId: int(dataObj.cpuTopo.numaId), SocketId: int(dataObj.cpuTopo.socketId)}
+			goDatas[i].CpuTopo = CpuTopology{CoreId: int(dataObj.cpuTopo.coreId), NumaId: int(dataObj.cpuTopo.numaId), SocketId: int(dataObj.cpuTopo.socketId)}
 		}
 
 		if dataObj.ext != nil {
diff --git a/go/src/libkperf_test/libkperf_test.go b/go/src/libkperf_test/libkperf_test.go
index a0343d577c059e2bbb1c4cb0b39453a330cfd202..2f55951332b8b5224bb72e7f71fda562fdd3a405 100644
--- a/go/src/libkperf_test/libkperf_test.go
+++ b/go/src/libkperf_test/libkperf_test.go
@@ -157,7 +157,7 @@ func TestSysCallTrace(t *testing.T) {
 	t.Logf("==========================pmu get trace data success==========================")
 
 	for _, v := range traceList.GoTraceData {
-		t.Logf("comm=%v, func=%v, elapsedTime=%v, pid=%v, tid=%v, cpu=%v", v.Comm, v.FuncName, v.ElapsedTime, v.Pid, v.Tid, v.Cpu)
+		t.Logf("comm=%v, func=%v, elapsedTime=%v, startTs=%v, pid=%v, tid=%v, cpu=%v", v.Comm, v.FuncName, v.ElapsedTime, v.StartTs, v.Pid, v.Tid, v.Cpu)
 	}
 
 	kperf.PmuTraceFree(traceList)
diff --git a/pmu/pmu_list.cpp b/pmu/pmu_list.cpp
index 09a32f06fa9a9523ae4893160c7a66a880a7398b..aa38b70b468a99a4306b31cd7716847b6debae74 100644
--- a/pmu/pmu_list.cpp
+++ b/pmu/pmu_list.cpp
@@ -281,6 +281,19 @@ namespace KUNPENG_PMU {
         return userData;
     }
 
+    // Drop leading kernel frames so that the stored stack starts at the first user-space frame.
+    static void TrimKernelStack(PmuData &data)
+    {
+        auto stack = data.stack;
+        while (stack != nullptr && stack->symbol != nullptr) {
+            if (strcmp(stack->symbol->module, "[kernel]") == 0) {
+                stack = stack->next;
+                continue;
+            }
+            data.stack = stack;
+            break;
+        }
+    }
+
     void HandleBlockData(std::vector<PmuData>& pmuData, std::vector<PmuSwitchData>& switchData)
     {
         std::sort(switchData.begin(), switchData.end(), [](const PmuSwitchData& a, const PmuSwitchData& b) {
@@ -332,6 +345,9 @@
                 DBG_PRINT("New tid encountered: tid=%d\n", currentTid);
             }
             if (strcmp(item.evt, "context-switches") == 0) {
+                // Convert the stack from 'schedule[kernel] -> futex_wait[kernel] -> ...[kernel] -> lock_wait -> start_thread'
+                // to 'lock_wait -> start_thread', keeping only the user stack.
+                TrimKernelStack(item);
                 // Before the context-switches event, there is only one cycles event, which we need to ignore.
                 if (currentTs == 0) {
                     DBG_PRINT("Ignoring first cycles event for tid=%d\n", item.tid);