diff --git a/README.en.md b/README.en.md
index efa7d54dcd2827eb86bd9379f220485a30bc27df..503138f40891afa6ed46b2bf9e2b6097ea9a926a 100644
--- a/README.en.md
+++ b/README.en.md
@@ -262,7 +262,7 @@ func main() {
 	}
 
 	for _, o := range dataVo.GoData {
-		fmt.Printf("event: %v count: %v", o.Evt, o.Count)
+		fmt.Printf("event: %v count: %v\n", o.Evt, o.Count)
 	}
 	kperf.PmuDataFree(dataVo)
 	kperf.PmuClose(fd)
diff --git a/README.md b/README.md
index d511787ac9d9b59ff88b05579dffff89e359561d..ee59dc44420dffe8ec31da173d09919ef0fc8366 100644
--- a/README.md
+++ b/README.md
@@ -237,7 +237,7 @@ func main() {
 	}
 
 	for _, o := range dataVo.GoData {
-		fmt.Printf("event: %v count: %v", o.Evt, o.Count)
+		fmt.Printf("event: %v count: %v\n", o.Evt, o.Count)
 	}
 	kperf.PmuDataFree(dataVo)
 	kperf.PmuClose(fd)
diff --git a/docs/Details_Usage.md b/docs/Details_Usage.md
index 70c50539446da88fdab01ad095469d65ac13c3e1..815f4d86f993ef635022ab2bd9e58e608e166ea4 100644
--- a/docs/Details_Usage.md
+++ b/docs/Details_Usage.md
@@ -600,162 +600,230 @@ pmu_attr = kperf.PmuAttr(evtList=evtList, includeNewFork=True)
 Note that this feature targets Counting mode, because Sampling and SPE Sampling already collect child-thread data by themselves.
 
 ### Collecting DDRC bandwidth
-DDRC memory bandwidth can be computed from uncore events; the formula differs between hardware platforms.
-The formulas for Kunpeng chips can be found in the openEuler kernel at tools/perf/pmu-events/arch/arm64/hisilicon/hip09/sys/uncore-ddrc.json:
-```json
-  {
-    "MetricExpr": "flux_wr * 32 / duration_time",
-    "BriefDescription": "Average bandwidth of DDRC memory write(Byte/s)",
-    "Compat": "0x00000030",
-    "MetricGroup": "DDRC",
-    "MetricName": "ddrc_bw_write",
-    "Unit": "hisi_sccl,ddrc"
-  },
-  {
-    "MetricExpr": "flux_rd * 32 / duration_time",
-    "BriefDescription": "Average bandwidth of DDRC memory read(Byte/s)",
-    "Compat": "0x00000030",
-    "MetricGroup": "DDRC",
-    "MetricName": "ddrc_bw_read",
-    "Unit": "hisi_sccl,ddrc"
-  },
-```
-
-Following these formulas, collect the flux_wr and flux_rd events and compute the bandwidth:
-```c++
-// c++ example
-vector<string> evts = {
-    "hisi_sccl1_ddrc/flux_rd/",
-    "hisi_sccl3_ddrc/flux_rd/",
-    "hisi_sccl5_ddrc/flux_rd/",
-    "hisi_sccl7_ddrc/flux_rd/",
-    "hisi_sccl1_ddrc/flux_wr/",
-    "hisi_sccl3_ddrc/flux_wr/",
-    "hisi_sccl5_ddrc/flux_wr/",
-    "hisi_sccl7_ddrc/flux_wr/"
-}; // Collect flux_rd and flux_wr of the hisi_scclX_ddrc devices. The device names vary with the hardware and can be looked up under /sys/devices/.
-
-PmuAttr attr = {0};
-attr.evtList = evts.data();
-attr.numEvt = evts.size();
-
-int pd = PmuOpen(COUNTING, &attr);
-if (pd == -1) {
-    cout << Perror() << "\n";
-    return;
-}
-
-PmuEnable(pd);
-for (int i = 0; i < 60; ++i) {
-    sleep(1);
-    PmuData *data = nullptr;
-    int len = PmuRead(pd, &data);
-    // There are 8 uncore events, so data has 8 entries.
-    // The first 4 are the read bandwidth of the 4 numa nodes, the last 4 are the write bandwidth.
-    for (int j = 0; j < 4; ++j) {
-        printf("read bandwidth: %f M/s\n", (float)data[j].count * 32 / 1024 / 1024);
-    }
-    for (int j = 4; j < 8; ++j) {
-        printf("write bandwidth: %f M/s\n", (float)data[j].count * 32 / 1024 / 1024);
-    }
-    PmuDataFree(data);
-}
-PmuDisable(pd);
-PmuClose(pd);
-```
-
-```python
-# python example
-import kperf
-import time
-
-evtList = ["hisi_sccl1_ddrc/flux_rd/", "hisi_sccl3_ddrc/flux_rd/", "hisi_sccl5_ddrc/flux_rd/", "hisi_sccl7_ddrc/flux_rd/",
-           "hisi_sccl1_ddrc/flux_wr/", "hisi_sccl3_ddrc/flux_wr/", "hisi_sccl5_ddrc/flux_wr/", "hisi_sccl7_ddrc/flux_wr/"]
-pmu_attr = kperf.PmuAttr(evtList=evtList)
-pd = kperf.open(kperf.PmuTaskType.COUNTING, pmu_attr)
-kperf.enable(pd)
-for i in range(60):
-    time.sleep(1)
-    data = kperf.read(pd)
-    j = 0
-    for o in data.iter:
-        bandwidth = o.count * 32 / 1024 / 1024
-        if j < 4:
-            print(f"read bandwidth: {bandwidth} M/s\n")
-        if j >= 4 and j < 8:
-            print(f"write bandwidth: {bandwidth} M/s\n")
-        j += 1
-kperf.disable(pd)
-kperf.close(pd)
-```
-
-```go
-// go example
-import "libkperf/kperf"
-import "time"
-import "fmt"
-
-func main() {
-    evtList := []string{"hisi_sccl1_ddrc/flux_rd/",
-        "hisi_sccl3_ddrc/flux_rd/",
-        "hisi_sccl5_ddrc/flux_rd/",
-        "hisi_sccl7_ddrc/flux_rd/",
-        "hisi_sccl1_ddrc/flux_wr/",
-        "hisi_sccl3_ddrc/flux_wr/",
-        "hisi_sccl5_ddrc/flux_wr/",
-        "hisi_sccl7_ddrc/flux_wr/"}
-    attr := kperf.PmuAttr{EvtList: evtList}
-    pd, err := kperf.PmuOpen(kperf.COUNT, attr)
-    if err != nil {
-        fmt.Printf("kperf pmuopen sample failed, expect err is nil, but is %v\n", err)
-        return
-    }
-    kperf.PmuEnable(pd)
-
-    for i := 0; i < 60; i++ {
-        time.Sleep(time.Second)
-        dataVo, err := kperf.PmuRead(pd)
-        if err != nil {
-            fmt.Printf("kperf pmuread failed, expect err is nil, but is %v\n", err)
-        }
-
-        j := 0
-        for _, o := range dataVo.GoData {
-            bandwidth := o.Count * 32 / 1024 / 1024
-            if j < 4 {
-                fmt.Printf("read bandwidth: %v M/s\n", bandwidth)
-            }
-            if j >= 4 && j < 8 {
-                fmt.Printf("write bandwidth: %v M/s\n", bandwidth)
-            }
-            j += 1
-        }
-    }
-    kperf.PmuDisable(pd)
-    kperf.PmuClose(pd)
-}
-```
+Kunpeng provides a DDRC PMU device for collecting DDR performance data such as bandwidth. libkperf offers an API that reports the DDR bandwidth of each numa node.
+
+Reference code:
+```c++
+// c++ example
+PmuDeviceAttr devAttr[2];
+// DDR read bandwidth
+devAttr[0].metric = PMU_DDR_READ_BW;
+// DDR write bandwidth
+devAttr[1].metric = PMU_DDR_WRITE_BW;
+// Initialize the collection task.
+int pd = PmuDeviceOpen(devAttr, 2);
+// Start collecting.
+PmuEnable(pd);
+sleep(1);
+// Read the raw records.
+PmuData *oriData = nullptr;
+int oriLen = PmuRead(pd, &oriData);
+PmuDeviceData *devData = nullptr;
+auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 2, &devData);
+// On a server with 4 numa nodes, devData has 8 entries: the first 4 are read bandwidth, the last 4 are write bandwidth.
+for (int i = 0; i < 4; ++i) {
+    // numaId is the numa node the record belongs to.
+    // count is the total number of bytes read/written since the last read, in Byte;
+    // divide it by the collection interval to get the bandwidth (here the interval is 1 second).
+    cout << "read bandwidth(" << devData[i].numaId << "): " << devData[i].count/1024/1024 << "M/s\n";
+}
+for (int i = 4; i < 8; ++i) {
+    cout << "write bandwidth(" << devData[i].numaId << "): " << devData[i].count/1024/1024 << "M/s\n";
+}
+DevDataFree(devData);
+PmuDataFree(oriData);
+PmuDisable(pd);
+```
+
+```python
+# python example
+dev_attr = [
+    kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_READ_BW),
+    kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW)
+]
+pd = kperf.device_open(dev_attr)
+kperf.enable(pd)
+time.sleep(1)
+kperf.disable(pd)
+ori_data = kperf.read(pd)
+dev_data = kperf.get_device_metric(ori_data, dev_attr)
+for data in dev_data.iter:
+    if data.metric == kperf.PmuDeviceMetric.PMU_DDR_READ_BW:
+        print(f"read bandwidth({data.numaId}): {data.count/1024/1024} M/s")
+    if data.metric == kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW:
+        print(f"write bandwidth({data.numaId}): {data.count/1024/1024} M/s")
+```
+
+```go
+// go example
+deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_DDR_READ_BW}, kperf.PmuDeviceAttr{Metric: kperf.PMU_DDR_WRITE_BW}}
+fd, _ := kperf.PmuDeviceOpen(deviceAttrs)
+kperf.PmuEnable(fd)
+time.Sleep(1 * time.Second)
+kperf.PmuDisable(fd)
+dataVo, _ := kperf.PmuRead(fd)
+deviceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs)
+for _, v := range deviceDataVo.GoDeviceData {
+	if v.Metric == kperf.PMU_DDR_READ_BW {
+		fmt.Printf("read bandwidth(%v): %v M/s\n", v.NumaId, v.Count/1024/1024)
+	}
+	if v.Metric == kperf.PMU_DDR_WRITE_BW {
+		fmt.Printf("write bandwidth(%v): %v M/s\n", v.NumaId, v.Count/1024/1024)
+	}
+}
+kperf.DevDataFree(deviceDataVo)
+kperf.PmuDataFree(dataVo)
+kperf.PmuClose(fd)
+```
+
+Running the code above prints something like:
+```
+read bandwidth(0): 17.32 M/s
+read bandwidth(1): 5.43 M/s
+read bandwidth(2): 2.83 M/s
+read bandwidth(3): 4.09 M/s
+write bandwidth(0): 4.35 M/s
+write bandwidth(1): 2.29 M/s
+write bandwidth(2): 0.84 M/s
+write bandwidth(3): 0.97 M/s
+```
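+
+The `count`/`Count` fields above are byte deltas accumulated since the previous read, so for intervals other than 1 second they must also be divided by the elapsed time. A minimal Go sketch of that conversion (the helper name `bandwidthMBps` is ours, not part of libkperf):
+```go
+import "time"
+
+// bandwidthMBps converts a byte delta returned by PmuGetDevMetric into MB/s,
+// given the interval between enabling the task and reading it.
+func bandwidthMBps(count float64, interval time.Duration) float64 {
+	return count / interval.Seconds() / 1024 / 1024
+}
+```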
+
+### Collecting L3 cache latency
+libkperf can collect the average latency of the L3 cache, which helps analyze the bottlenecks of memory-bound applications.
+Data is collected at cluster granularity. Each cluster contains 4 cpu cores (8 if hyper-threading is enabled); PmuGetClusterCore maps a cluster id to its core ids.
+
+Reference code:
+```c++
+// c++ example
+PmuDeviceAttr devAttr[1];
+// Average L3 latency.
+devAttr[0].metric = PMU_L3_LAT;
+// Initialize the collection task.
+int pd = PmuDeviceOpen(devAttr, 1);
+// Start collecting.
+PmuEnable(pd);
+sleep(1);
+PmuData *oriData = nullptr;
+int oriLen = PmuRead(pd, &oriData);
+PmuDeviceData *devData = nullptr;
+auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 1, &devData);
+// devData has one entry per cluster.
+for (int i = 0; i < len; ++i) {
+    // clusterId is the cluster the record belongs to.
+    cout << "L3 latency(" << devData[i].clusterId << "): " << devData[i].count << " cycles\n";
+}
+DevDataFree(devData);
+PmuDataFree(oriData);
+PmuDisable(pd);
+```
+
+```python
+# python example
+dev_attr = [
+    kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_L3_LAT)
+]
+pd = kperf.device_open(dev_attr)
+kperf.enable(pd)
+time.sleep(1)
+kperf.disable(pd)
+ori_data = kperf.read(pd)
+dev_data = kperf.get_device_metric(ori_data, dev_attr)
+for data in dev_data.iter:
+    print(f"L3 latency({data.clusterId}): {data.count} cycles")
+```
+
+```go
+// go example
+deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_L3_LAT}}
+fd, _ := kperf.PmuDeviceOpen(deviceAttrs)
+kperf.PmuEnable(fd)
+time.Sleep(1 * time.Second)
+kperf.PmuDisable(fd)
+dataVo, _ := kperf.PmuRead(fd)
+deviceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs)
+for _, v := range deviceDataVo.GoDeviceData {
+	fmt.Printf("L3 latency(%v): %v cycles\n", v.ClusterId, v.Count)
+}
+kperf.DevDataFree(deviceDataVo)
+kperf.PmuDataFree(dataVo)
+kperf.PmuClose(fd)
+```
+
+Running the code above prints something like:
+```
+L3 latency(0): 101 cycles
+L3 latency(1): 334.6 cycles
+L3 latency(2): 267.8 cycles
+L3 latency(3): 198.4 cycles
+...
+```
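+
+Each cluster id in the output can be mapped back to its cpu cores with PmuGetClusterCore, whose Go signature is documented in Go_API.md. A minimal sketch, with error handling elided:
+```go
+// Print the cpu cores behind cluster 0.
+coreList, err := kperf.PmuGetClusterCore(uint(0))
+if err == nil {
+	fmt.Printf("cluster 0 -> cores %v\n", coreList)
+}
+```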
+
+### Collecting PCIE bandwidth
+libkperf can collect PCIE bandwidth, both read and write in the tx and rx directions, which is useful for monitoring the bandwidth of peripheral devices (nvme, gpu, and so on).
+Not every PCIE device can be measured: the Kunpeng PMU devices only cover part of them. PmuDeviceBdfList returns the PCIE devices or root ports that are collectable on the current machine.
+
+Reference code:
+```c++
+// c++ example
+PmuDeviceAttr devAttr[1];
+// Collect the rx read bandwidth of a PCIE device.
+devAttr[0].metric = PMU_PCIE_RX_MRD_BW;
+// Set the bdf number of the PCIE device.
+devAttr[0].bdf = "16:04.0";
+// Initialize the collection task.
+int pd = PmuDeviceOpen(devAttr, 1);
+// Start collecting.
+PmuEnable(pd);
+sleep(1);
+PmuData *oriData = nullptr;
+int oriLen = PmuRead(pd, &oriData);
+PmuDeviceData *devData = nullptr;
+auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 1, &devData);
+// devData has one entry per pcie device.
+for (int i = 0; i < len; ++i) {
+    // bdf is the pcie device the record belongs to.
+    cout << "rx mrd bandwidth(" << devData[i].bdf << "): " << devData[i].count << " Bytes/ns\n";
+}
+DevDataFree(devData);
+PmuDataFree(oriData);
+PmuDisable(pd);
+```
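+
+Before opening the task, the devices that can actually be measured can be enumerated with PmuDeviceBdfList (documented in Go_API.md). A minimal Go sketch, with error handling elided:
+```go
+// List the bdf numbers of collectable PCIE devices,
+// then pick one of them as PmuDeviceAttr.Bdf.
+bdfList, err := kperf.PmuDeviceBdfList(kperf.PMU_BDF_TYPE_PCIE)
+if err == nil {
+	fmt.Printf("collectable PCIE bdf list: %v\n", bdfList)
+}
+```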
+
-### Blocked Sample sampling
+### Mixed IO and compute hotspot sampling (Blocked Sample)
 Blocked Sample is a newly added sampling mode that collects both on-cpu and off-cpu data of a process. It is enabled through the blockedSample field: cycles and context-switches events are collected together, and the off-cpu period is derived from the context switches.
 
-Notes:
-
-1. Only SAMPLING mode is supported.
-
-2. Only process-level analysis is supported; system-wide analysis is not.
-
-Usage example:
-```bash
-cd example
-# Run the C++ example and analyze hotspots
-bash run.sh all
-# Run the python example and analyze hotspots
-bash run.sh all python=true
-```
-
-### Enhanced uncore event collection
-1. Configurable uncore events, for example:
-```bash
-smmuv3_pmcg_100020/transaction,filter_enable=1,filter_stream_id=0x7d/
-```
-2. Collection and query of L3, DDR, SMMU and PCIE performance data:
-- L3 bandwidth, hits and misses per core, on 920 and the high-performance 920
-- L3 latency per numa, on the high-performance 920
-- DDR read/write bandwidth per numa, on 920 and the high-performance 920
-- SMMU address translation count for a given bdf, on 920 and the high-performance 920
-- PCIE rx/tx read and write bandwidth for a given bdf, on the high-performance 920
-
-Code example:
-```C++
-// C++ example
-PmuDeviceAttr devAttr = {};
-devAttr.metric = PMU_L3_TRAFFIC;
-int pd = PmuDeviceOpen(&devAttr, 1);
-
-PmuEnable(pd);
-sleep(1);
-PmuDisable(pd);
-
-PmuData* oriData = nullptr;
-int oriLen = PmuRead(pd, &oriData);
-
-PmuDeviceData *devData = nullptr;
-auto len = PmuGetDevMetric(oriData, oriLen, &devAttr, 1, &devData);
-```
-
-```python
-# python example
-dev_attr = [
-    kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_L3_TRAFFIC)
-]
-pd = kperf.device_open(dev_attr)
-
-kperf.enable(pd)
-time.sleep(1)
-kperf.disable(pd)
-ori_data = kperf.read(pd)
-
-dev_data = kperf.get_device_metric(ori_data, dev_attr)
-```
+See example/pmu_hotspot.cpp for detailed usage.
+Build command:
+```
+g++ -g pmu_hotspot.cpp -o pmu_hotspot -I /path/to/libkperf/include -L /path/to/libkperf/lib -lkperf -lsym
+```
+Consider this example:
+```
+thread1:
+  busy_io
+    compute
+    while
+      write
+      fsync
+thread2:
+  cpu_compute
+    while
+      compute
+```
+It contains both computation (compute) and IO (write, fsync). Sampling it with perf captures only the on-cpu side:
+|overhead|Shared Object|Symbol|
+|--------|-------------|------|
+|99.94%|test_io|compute|
+|0.03%|libpthread-2.17.so|__pthread_enable_asynccancel|
+|0.00%|test_io|busy_io|
+
+Collect with pmu_hotspot instead:
+```
+pmu_hotspot 5 1 1
+```
+
+Output:
+|overhead|Shared Object|Symbol|
+|--------|-------------|------|
+|54.74%|libpthread-2.17.so|fsync|
+|27.18%|test_io|compute|
+fsync is now captured, showing that IO accounts for a larger share of this process than computation.
+
+Limitations:
+
+1. Only SAMPLING mode is supported.
+
+2. Only process-level analysis is supported; system-wide analysis is not.
\ No newline at end of file
diff --git a/docs/Go_API.md b/docs/Go_API.md
index 4db378a4affd999e87cb5c3e43fed7375bbbd027..fa1ba8f8f9dba44ddcd052317675bcaaa663cb52 100644
--- a/docs/Go_API.md
+++ b/docs/Go_API.md
@@ -127,7 +127,7 @@ func PmuRead(fd int) (PmuDataVo, error)
   * Period uint64 sampling period
   * Count uint64 event count
   * CountPercent float64 count ratio, enabled time / running time
-  * CpuTopo CpuTopolopy
+  * CpuTopo CpuTopology
     * CoreId system core ID
     * NumaId numa ID
     * SocketId socket ID
@@ -267,6 +267,7 @@ func PmuTraceRead(taskId int) (PmuTraceDataVo, error)
 
   * FuncName string syscall function name
   * ElapsedTime float64 elapsed time
+  * StartTs start timestamp
   * Pid int process id
   * Tid int thread id
   * Cpu int cpu number
@@ -280,7 +281,7 @@
 if err != nil {
 }
 
 for _, v := range traceList.GoTraceData {
-	fmt.Printf("funcName: %v, elapsedTime: %v ms pid: %v tid: %v, cpu: %v comm: %v\n", v.FuncName, v.ElapsedTime, v.Pid, v.Tid, v.Cpu, v.Comm)
+	fmt.Printf("funcName: %v, elapsedTime: %v ms startTs: %v pid: %v tid: %v, cpu: %v comm: %v\n", v.FuncName, v.ElapsedTime, v.StartTs, v.Pid, v.Tid, v.Cpu, v.Comm)
 }
 ```
@@ -307,3 +308,147 @@ func main() {
 	}
 }
 ```
+
+### kperf.PmuDeviceBdfList
+
+func PmuDeviceBdfList(bdfType C.enum_PmuBdfType) ([]string, error) queries all bdf numbers of the given type from the system
+* bdfType C.enum_PmuBdfType
+  * PMU_BDF_TYPE_PCIE bdf numbers of PCIE devices
+  * PMU_BDF_TYPE_SMMU bdf numbers of SMMU devices
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	pcieBdfList, err := kperf.PmuDeviceBdfList(kperf.PMU_BDF_TYPE_PCIE)
+	if err != nil {
+		fmt.Printf("kperf GetDeviceBdfList failed, expect err is nil, but is %v\n", err)
+	}
+	for _, v := range pcieBdfList {
+		fmt.Printf("bdf is %v\n", v)
+	}
+}
+```
+### kperf.PmuDeviceOpen
+
+func PmuDeviceOpen(attr []PmuDeviceAttr) (int, error) initializes collection of uncore-event metrics
+
+* type PmuDeviceAttr struct:
+  * Metric: the metric to collect
+    * PMU_DDR_READ_BW ddrc read bandwidth per numa node, unit: Bytes
+    * PMU_DDR_WRITE_BW ddrc write bandwidth per numa node, unit: Bytes
+    * PMU_L3_TRAFFIC L3 bytes accessed per core, unit: Bytes
+    * PMU_L3_MISS L3 misses per core, unit: count
+    * PMU_L3_REF total L3 accesses per core, unit: count
+    * PMU_L3_LAT total L3 latency per cluster, unit: cycles
+    * PMU_PCIE_RX_MRD_BW read bandwidth in the rx direction of a pcie device, unit: Bytes/ns
+    * PMU_PCIE_RX_MWR_BW write bandwidth in the rx direction of a pcie device, unit: Bytes/ns
+    * PMU_PCIE_TX_MRD_BW read bandwidth in the tx direction of a pcie device, unit: Bytes/ns
+    * PMU_PCIE_TX_MWR_BW write bandwidth in the tx direction of a pcie device, unit: Bytes/ns
+    * PMU_SMMU_TRAN address translation count of the given smmu device, unit: count
+  * Bdf: the bdf number of the device to collect; only meaningful for the pcie and smmu metrics
+* The return values are an int and an error: pd > 0 means initialization succeeded, pd == -1 means it failed and the returned error carries the details. Below is an example of kperf.PmuDeviceOpen:
+
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_L3_LAT}}
+	fd, err := kperf.PmuDeviceOpen(deviceAttrs)
+	if err != nil {
+		fmt.Printf("kperf PmuDeviceOpen failed, expect err is nil, but is %v\n", err)
+	}
+}
+```
+
+### kperf.PmuGetDevMetric
+
+func PmuGetDevMetric(dataVo PmuDataVo, deviceAttr []PmuDeviceAttr) (PmuDeviceDataVo, error) aggregates the raw data returned by PmuRead according to the metrics given in deviceAttr; the result is a list of PmuDeviceData
+
+* type PmuDataVo struct: the raw data returned by PmuRead
+* []PmuDeviceAttr: the metrics to aggregate by
+* type PmuDeviceDataVo struct:
+  * GoDeviceData []PmuDeviceData
+* type PmuDeviceData struct:
+  * Metric C.enum_PmuDeviceMetric the collected metric
+  * Count float64 the metric value
+  * Mode C.enum_PmuMetricMode how the metric is aggregated: per core, per numa or per bdf
+  * CoreId uint32 core number of the record
+  * NumaId uint32 numa number of the record
+  * ClusterId uint32 cluster ID
+  * Bdf string bdf number of the record
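+
+Below is a minimal end-to-end sketch combining PmuRead and PmuGetDevMetric, mirroring the DDR example in Details_Usage.md (the metric choice and the 1-second interval are illustrative):
+
+```go
+import "libkperf/kperf"
+import "time"
+import "fmt"
+
+func main() {
+	deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_DDR_READ_BW}}
+	fd, err := kperf.PmuDeviceOpen(deviceAttrs)
+	if err != nil {
+		fmt.Printf("kperf PmuDeviceOpen failed, expect err is nil, but is %v\n", err)
+		return
+	}
+	kperf.PmuEnable(fd)
+	time.Sleep(1 * time.Second)
+	kperf.PmuDisable(fd)
+	dataVo, _ := kperf.PmuRead(fd)
+	// Aggregate the raw records into one entry per numa node.
+	deviceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs)
+	for _, v := range deviceDataVo.GoDeviceData {
+		fmt.Printf("metric: %v numa: %v count: %v\n", v.Metric, v.NumaId, v.Count)
+	}
+	kperf.DevDataFree(deviceDataVo)
+	kperf.PmuDataFree(dataVo)
+	kperf.PmuClose(fd)
+}
+```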
+
+### kperf.DevDataFree
+
+func DevDataFree(devVo PmuDeviceDataVo) releases the pointer data behind PmuDeviceData
+
+### kperf.PmuGetClusterCore
+
+func PmuGetClusterCore(clusterId uint) ([]uint, error) queries the core list of the given clusterId
+
+* clusterId the cluster id of the CPU
+* Return value: the core list of the given clusterId; on error the list is empty and error is non-nil
+
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	clusterId := uint(1)
+	coreList, err := kperf.PmuGetClusterCore(clusterId)
+	if err != nil {
+		fmt.Printf("kperf PmuGetClusterCore failed, expect err is nil, but is %v\n", err)
+		return
+	}
+	for _, v := range coreList {
+		fmt.Printf("coreId has:%v\n", v)
+	}
+}
+```
+
+### kperf.PmuGetNumaCore
+
+func PmuGetNumaCore(nodeId uint) ([]uint, error) queries the core list of the given numa node
+
+* nodeId the numa node id
+* Return value: the cpu core list of the given numa node; on error the list is empty and error is non-nil
+
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	nodeId := uint(0)
+	coreList, err := kperf.PmuGetNumaCore(nodeId)
+	if err != nil {
+		fmt.Printf("kperf PmuGetNumaCore failed, expect err is nil, but is %v\n", err)
+		return
+	}
+	for _, v := range coreList {
+		fmt.Printf("coreId has:%v\n", v)
+	}
+}
+```
+
+
+### kperf.PmuGetCpuFreq
+func PmuGetCpuFreq(core uint) (int64, error) queries the real-time frequency of the given cpu core
+
+* core cpu coreId
+* Return value: an int64, the real-time frequency of the given cpu core; on error the value is -1 and error is non-nil
+
+```go
+import "libkperf/kperf"
+import "fmt"
+
+func main() {
+	coreId := uint(0)
+	freq, err := kperf.PmuGetCpuFreq(coreId)
+	if err != nil {
+		fmt.Printf("kperf PmuGetCpuFreq failed, expect err is nil, but is %v\n", err)
+		return
+	}
+	fmt.Printf("coreId %v freq is %v\n", coreId, freq)
+}
+```
\ No newline at end of file
diff --git a/go/src/libkperf/kperf/kperf.go b/go/src/libkperf/kperf/kperf.go
index cf10f715b4fa6eef5afd9458377d6b0737e8bb63..2b5958ca39adc2fd88a5f2a6e7eb43c12fd4a631 100644
--- a/go/src/libkperf/kperf/kperf.go
+++ b/go/src/libkperf/kperf/kperf.go
@@ -308,13 +308,13 @@ type PmuAttr struct {
 	CallStack bool                 // This indicates whether to collect whole callchains or only top frame
 	DataFilter C.enum_SpeFilter    // Spe Data Filter.Refer to comments of SpeFilter
 	EvFilter C.enum_SpeEventFilter // Spe Event filter.Refer to comments of SpeEventFilter
-	MinLatency uint64              // Collect only smaples with latency or higher
-	IncludeNewFork bool            // enable it you can get the new child thread count, only in couting mode
-	BranchSampleFilter uint64      // if the filering mode is set, branch_sample_stack data is collected in sampling mode
-	BlockedSample bool             // This indicates whether the blocked sample mode is enabled. In this mode, both on Cpu and off Cpu data is collectd
+	MinLatency uint64              // Collect only samples with latency or higher
+	IncludeNewFork bool            // enable it you can get the new child thread count, only in counting mode
+	BranchSampleFilter uint64      // if the filter mode is set, branch_sample_stack data is collected in sampling mode
+	BlockedSample bool             // This indicates whether the blocked sample mode is enabled. In this mode, both on Cpu and off Cpu data is collected
 }
 
-type CpuTopolopy struct {
+type CpuTopology struct {
 	CoreId int   // cpu core id
 	NumaId int   // numa id
 	SocketId int // socket id
@@ -338,12 +338,12 @@ type PmuData struct {
 	Ts uint64   // time stamp. unit: ns
 	Pid int     // process id
 	Tid int     // thread id
-	Cpu int      // cpu id
+	Cpu int     // cpu id
 	Comm string // process command
 	Period uint64 // sample period
 	Count uint64  // event count. Only available for counting
 	CountPercent float64 // event count Percent. when count = 0, countPercent = -1; Only available for counting
-	CpuTopo CpuTopolopy // cpu topolopy
+	CpuTopo CpuTopology // cpu topology
 	Symbols []sym.Symbol // symbol list
 	BranchRecords []BranchSampleRecord // branch record list
 	SpeExt SpeDataExt // SPE data
@@ -353,7 +353,7 @@ type PmuData struct {
 
 type PmuDataVo struct {
 	GoData []PmuData // PmuData list
-	cData *C.struct_PmuData // Pointer to PmuData in inferface C
+	cData *C.struct_PmuData // Pointer to PmuData in interface C
 	fd int // fd
 }
@@ -374,10 +374,11 @@ type PmuTraceAttr struct {
 // PmuTraceData info
 type PmuTraceData struct {
 	FuncName string // function name
+	StartTs int64   // start timestamp. unit: us
 	ElapsedTime float64 // elapsed time
 	Pid int     // process id
 	Tid int     // thread id
-	Cpu int      // cpu id
+	Cpu int     // cpu id
 	Comm string // process command
 }
@@ -390,7 +391,7 @@ type PmuTraceDataVo struct {
 
 type PmuDeviceAttr struct {
 	Metric C.enum_PmuDeviceMetric
-	// Used for PMU_PCIE_XXX and PMU_SMMU_XXX to collect a specifi pcie device.
+	// Used for PMU_PCIE_XXX and PMU_SMMU_XXX to collect a specific pcie device.
 	// The string of bdf is something like '7a:01.0'.
 	Bdf string
 }
@@ -403,7 +404,7 @@ type PmuDeviceData struct {
 	Mode C.enum_PmuMetricMode // Field of union depends on the above.
 	CoreId uint32    // for percore metric
 	NumaId uint32    // for pernuma metric
-	ClusterId uint32 // for percluster emtric
+	ClusterId uint32 // for percluster metric
 	Bdf string       // for perpcie metric
 }
@@ -548,7 +549,7 @@ func PmuEventList(eventType C.enum_PmuEventType) []string {
 // Enable counting or sampling of task <fd>.
 // On success, nil is returned.
 // On error, error is returned.
-// param pd task id
+// param fd task id
 // return error
 func PmuEnable(fd int) error {
 	rs := C.PmuEnable(C.int(fd))
@@ -561,7 +562,7 @@
 // Disable counting or sampling of task <fd>.
 // On success, nil is returned.
 // On error, error is returned.
-// param pd task id
+// param fd task id
 // return err
 func PmuDisable(fd int) error {
 	rs := C.PmuDisable(C.int(fd))
@@ -613,7 +614,7 @@ func PmuDataFree(data PmuDataVo) {
 
 // Close task with id <fd>
 // After PmuClose is called, all pmu data related to the task become invalid
-// param pd task id
+// param fd task id
 func PmuClose(fd int) {
 	if fd <= 0 {
 		return
@@ -627,7 +628,7 @@
 
 // stop a sampling task in asynchronous mode
-// param pd pmu descriptor.
+// param fd pmu descriptor.
 func PmuStop(fd int) {
 	if fd <= 0 {
 		return
@@ -641,7 +642,7 @@
 // That is to say, for COUNTING, counts of all pmu event are reset to zero in PmuRead
 // For SAMPLING and SPE_SAMPLING, samples collected are started from the last PmuEnable or PmuRead
 // On success, PmuDataVo is returned
-// param pd task id
+// param fd task id
 // return PmuDataVo and error
 func PmuRead(fd int) (PmuDataVo, error) {
 	pmuDataVo := PmuDataVo{}
@@ -762,7 +763,7 @@ func PmuTraceOpen(traceType C.enum_PmuTraceType, traceAttr PmuTraceAttr) (int, e
 // Enable trace collection of task <taskId>
 // On success, nil is returned.
 // On error, -1 is returned.
-// param pd trace collect task id
+// param taskId trace collect task id
 // return error code
 func PmuTraceEnable(taskId int) error {
 	rs := C.PmuTraceEnable(C.int(taskId))
@@ -775,7 +776,7 @@
 // Disable trace collection of task <taskId>
 // On success, nil is returned
 // On error, error is returned
-// param pd trace collect task id
+// param taskId trace collect task id
 // return error code
 func PmuTraceDisable(taskId int) error {
 	rs := C.PmuTraceDisable(C.int(taskId))
@@ -788,7 +789,7 @@
 // Collect data.
 // Pmu trace data are collected starting from the last PmuTraceEnable or PmuTraceRead
 // On success, PmuTraceDataVo is returned
-// param pd trace collect task id
+// param taskId trace collect task id
 // param PmuTraceDataVo pmu trace data
 // return PmuTraceDataVo and error
 func PmuTraceRead(taskId int) (PmuTraceDataVo, error) {
@@ -812,7 +813,7 @@ func PmuTraceRead(taskId int) (PmuTraceDataVo, error) {
 	cDataList := *(*[]C.struct_PmuTraceData)(unsafe.Pointer(&slice))
 	goTraceData := make([]PmuTraceData, int(traceLen))
 	for i, v := range cDataList {
-		goTraceData[i] = PmuTraceData{FuncName:C.GoString(v.funcs), ElapsedTime:float64(v.elapsedTime), Pid:int(v.pid), Tid: int(v.tid), Cpu: int(v.cpu), Comm: C.GoString(v.comm)}
+		goTraceData[i] = PmuTraceData{FuncName:C.GoString(v.funcs), StartTs: int64(v.startTs), ElapsedTime:float64(v.elapsedTime), Pid:int(v.pid), Tid: int(v.tid), Cpu: int(v.cpu), Comm: C.GoString(v.comm)}
 	}
 	res.GoTraceData = goTraceData
 	res.cTraceData = cTraceData
@@ -821,7 +822,7 @@
 
 // Close task with id <taskId>.
 // After PmuTraceClose is called, all pmu trace data related to the task become invalid
-// param collect task id
+// param taskId task id
 func PmuTraceClose(taskId int) {
 	C.PmuTraceClose(C.int(taskId))
 }
@@ -926,7 +927,11 @@ func PmuDeviceOpen(attr []PmuDeviceAttr) (int, error) {
 	cAttr := make([]C.struct_PmuDeviceAttr, len(attr))
 	for i, v := range attr {
 		cAttr[i].metric = v.Metric
-		cAttr[i].bdf = C.CString(v.Bdf)
+		// Only allocate a C string when a bdf was given; pass nil for metrics that do not need one.
+		if len(v.Bdf) > 0 {
+			cAttr[i].bdf = C.CString(v.Bdf)
+		} else {
+			cAttr[i].bdf = nil
+		}
 	}
 	deviceTaskId := C.PmuDeviceOpen(&cAttr[0], C.uint(len(attr)))
 	if int(deviceTaskId) == -1 {
@@ -947,7 +952,11 @@ func PmuGetDevMetric(dataVo PmuDataVo, deviceAttr []PmuDeviceAttr) (PmuDeviceDat
 	cAttr := make([]C.struct_PmuDeviceAttr, len(deviceAttr))
 	for i, v := range deviceAttr {
 		cAttr[i].metric = v.Metric
-		cAttr[i].bdf = C.CString(v.Bdf)
+		if len(v.Bdf) > 0 {
+			cAttr[i].bdf = C.CString(v.Bdf)
+		} else {
+			cAttr[i].bdf = nil
+		}
 	}
 	metricLen := C.int(0)
 	metricData := C.IPmuGetMetric(dataVo.cData, C.uint(len(dataVo.GoData)), &cAttr[0], C.uint(len(deviceAttr)), &metricLen)
@@ -1071,7 +1080,7 @@ func transferCPmuDataToGoData(cPmuData *C.struct_PmuData, dataLen int, fd int) [
 		goDatas[i].CountPercent = float64(dataObj.countPercent)
 		goDatas[i].Cpu = int(dataObj.cpu)
 		if dataObj.cpuTopo != nil {
-			goDatas[i].CpuTopo = CpuTopolopy{CoreId: int(dataObj.cpuTopo.coreId), NumaId: int(dataObj.cpuTopo.numaId), SocketId: int(dataObj.cpuTopo.socketId)}
+			goDatas[i].CpuTopo = CpuTopology{CoreId: int(dataObj.cpuTopo.coreId), NumaId: int(dataObj.cpuTopo.numaId), SocketId: int(dataObj.cpuTopo.socketId)}
 		}
 
 		if dataObj.ext != nil {
diff --git a/go/src/libkperf_test/libkperf_test.go b/go/src/libkperf_test/libkperf_test.go
index a0343d577c059e2bbb1c4cb0b39453a330cfd202..2f55951332b8b5224bb72e7f71fda562fdd3a405 100644
--- a/go/src/libkperf_test/libkperf_test.go
+++ b/go/src/libkperf_test/libkperf_test.go
@@ -157,7 +157,7 @@ func TestSysCallTrace(t *testing.T) {
 	t.Logf("==========================pmu get trace data success==========================")
 
 	for _, v := range traceList.GoTraceData {
-		t.Logf("comm=%v, func=%v, elapsedTime=%v, pid=%v, tid=%v, cpu=%v", v.Comm, v.FuncName, v.ElapsedTime, v.Pid, v.Tid, v.Cpu)
+		t.Logf("comm=%v, func=%v, elapsedTime=%v, startTs=%v, pid=%v, tid=%v, cpu=%v", v.Comm, v.FuncName, v.ElapsedTime, v.StartTs, v.Pid, v.Tid, v.Cpu)
 	}
 
 	kperf.PmuTraceFree(traceList)
diff --git a/pmu/pmu_list.cpp b/pmu/pmu_list.cpp
index 09a32f06fa9a9523ae4893160c7a66a880a7398b..aa38b70b468a99a4306b31cd7716847b6debae74 100644
--- a/pmu/pmu_list.cpp
+++ b/pmu/pmu_list.cpp
@@ -281,6 +281,19 @@ namespace KUNPENG_PMU {
         return userData;
     }
 
+    // Drop leading kernel frames so that the stored stack starts at the first user-space frame.
+    static void TrimKernelStack(PmuData &data)
+    {
+        auto stack = data.stack;
+        while (stack != nullptr && stack->symbol != nullptr) {
+            if (strcmp(stack->symbol->module, "[kernel]") == 0) {
+                stack = stack->next;
+                continue;
+            }
+            data.stack = stack;
+            break;
+        }
+    }
+
     void HandleBlockData(std::vector<PmuData>& pmuData, std::vector<PmuSwitchData>& switchData)
     {
         std::sort(switchData.begin(), switchData.end(), [](const PmuSwitchData& a, const PmuSwitchData& b) {
@@ -332,6 +345,9 @@
                 DBG_PRINT("New tid encountered: tid=%d\n", currentTid);
             }
             if (strcmp(item.evt, "context-switches") == 0) {
+                // Convert the stack from 'schedule[kernel] -> futex_wait[kernel] -> ...[kernel] -> lock_wait -> start_thread'
+                // to 'lock_wait -> start_thread', keeping only the user stack.
+                TrimKernelStack(item);
                 // Before the context-switches event, there is only one cycles event, which we need to ignore.
                 if (currentTs == 0) {
                     DBG_PRINT("Ignoring first cycles event for tid=%d\n", item.tid);