diff --git a/README.en.md b/README.en.md index f6a888ef93667577b9dcf85ec198b5cb122d725d..2940a2b6d515e4ae9327c0cd56b924c7c09b86e2 100644 --- a/README.en.md +++ b/README.en.md @@ -56,7 +56,7 @@ Minimum required GCC version: Minimum required Python version: -- python-3.7. +- python-3.6. To build a library with C API: diff --git a/README.md b/README.md index 0d5c764b2a2efdf6a42120c24bdd9b35530889f2..5c8ba57a241d01ac721dec13702722e0e716454a 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ v1.0: - gcc-4.8.5 和 glibc-2.17 最低依赖python版本: -- python-3.7 +- python-3.6 编译生成动态库和C的API: ```shell diff --git a/docs/Details_Usage.md b/docs/Details_Usage.md index 289c03c1bf30bb5038456c1140c043d0fb0392d3..84e1136552990969e095815871d6133242049f6f 100644 --- a/docs/Details_Usage.md +++ b/docs/Details_Usage.md @@ -41,7 +41,7 @@ func main() { attr := kperf.PmuAttr{EvtList:[]string{"cycles", "branch-misses"}} pd, err := kperf.PmuOpen(kperf.COUNT, attr) if err != nil { - fmt.Printf("kperf pmuopen couting failed, expect err is nil, but is %v", err) + fmt.Printf("kperf pmuopen counting failed, expect err is nil, but is %v", err) return } } @@ -331,7 +331,7 @@ func main() { attr := kperf.PmuAttr{EvtList:evtList} pd, err := kperf.PmuOpen(kperf.COUNT, attr) if err != nil { - fmt.Printf("kperf pmuopen couting failed, expect err is nil, but is %v\n", err) + fmt.Printf("kperf pmuopen counting failed, expect err is nil, but is %v\n", err) return } } @@ -600,7 +600,7 @@ pmu_attr = kperf.PmuAttr(evtList=evtList, includeNewFork=True) 注意,该功能是针对Counting模式,因为Sampling和SPE Sampling本身就会采集子线程的数据。 ### 采集DDRC带宽 -鲲鹏上提供了DDRC的pmu设备,用于采集DDR的性能数据,比如带宽等。libkperf提供了API,用于获取每个numa的DDR带宽数据。 +鲲鹏上提供了DDRC的pmu设备,用于采集DDR的性能数据,比如带宽等。libkperf提供了API,用于获取每个channel的DDR带宽数据。 参考代码: ```c++ @@ -620,15 +620,17 @@ PmuData *oriData = nullptr; int oriLen = PmuRead(pd, &oriData); PmuDeviceData *devData = nullptr; auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 2, &devData); -// 对于4个numa的服务器,devData的长度为8.前4个是读带宽,后4个是写带宽。 -for 
(int i=0;i<4;++i) { - // numaId表示数据对应的numa节点。 +// devData的长度为2 * n (总通道数)。前n个是读带宽,后n个是写带宽。 +for (int i = 0; i < len / 2; ++i) { + // socketId表示数据对应的socket节点。 + // ddrNumaId表示数据对应的numa节点。 + // channelID表示数据对应的通道ID。 // count是距离上次采集的DDR总读/写包长,单位是Byte, // 需要除以时间间隔得到带宽(这里的时间间隔是1秒)。 - cout << "read bandwidth(" << devData[i].numaId << "): " << devData[i].count/1024/1024 << "M/s\n"; + cout << "read bandwidth(Socket: " << devData[i].socketId << " Numa: " << devData[i].ddrNumaId << " Channel: " << devData[i].channelId << "): " << devData[i].count/1024/1024 << "M/s\n"; } -for (int i=4;i<8;++i) { - cout << "write bandwidth(" << devData[i].numaId << "): " << devData[i].count/1024/1024 << "M/s\n"; +for (int i = len / 2; i < len; ++i) { + cout << "write bandwidth(Socket: " << devData[i].socketId << " Numa: " << devData[i].ddrNumaId << " Channel: " << devData[i].channelId << "): " << devData[i].count/1024/1024 << "M/s\n"; } DevDataFree(devData); PmuDataFree(oriData); @@ -649,9 +651,9 @@ ori_data = kperf.read(pd) dev_data = kperf.get_device_metric(ori_data, dev_attr) for data in dev_data.iter: if data.metric == kperf.PmuDeviceMetric.PMU_DDR_READ_BW: - print(f"read bandwidth({data.numaId}): {data.count/1024/1024} M/s") + print(f"read bandwidth(Socket: {data.socketId} Numa: {data.ddrNumaId} Channel: {data.channelId}): {data.count/1024/1024} M/s") if data.metric == kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW: - print(f"write bandwidth({data.numaId}): {data.count/1024/1024} M/s") + print(f"write bandwidth(Socket: {data.socketId} Numa: {data.ddrNumaId} Channel: {data.channelId}): {data.count/1024/1024} M/s") ``` ```go @@ -665,10 +667,10 @@ dataVo, _ := kperf.PmuRead(fd) deivceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs) for _, v := range deivceDataVo.GoDeviceData { if v.Metric == kperf.PMU_DDR_READ_BW { - fmt.Printf("read bandwidth(%v): %v M/s\n", v.NumaId, v.Count/1024/1024) + fmt.Printf("read bandwidth(Socket: %v Numa: %v Channel: %v): %v M/s\n", v.SocketId, v.DdrNumaId, 
v.ChannelId, v.Count/1024/1024) } if v.Metric == kperf.PMU_DDR_WRITE_BW { - fmt.Printf("write bandwidth(%v): %v M/s\n", v.NumaId, v.Count/1024/1024) + fmt.Printf("write bandwidth(Socket: %v Numa: %v Channel: %v): %v M/s\n", v.SocketId, v.DdrNumaId, v.ChannelId, v.Count/1024/1024) } } kperf.DevDataFree(deivceDataVo) @@ -678,14 +680,23 @@ kperf.PmuClose(fd) 执行上述代码,输出的结果类似如下: ``` -read bandwidth(0): 17.32 M/s -read bandwidth(1): 5.43 M/s -read bandwidth(2): 2.83 M/s -read bandwidth(3): 4.09 M/s -write bandwidth(0): 4.35 M/s -write bandwidth(1): 2.29 M/s -write bandwidth(2): 0.84 M/s -write bandwidth(3): 0.97 M/s +read bandwidth(Socket: 0 Numa: 0 Channel: 0): 6.08 M/s +read bandwidth(Socket: 0 Numa: 0 Channel: 1): 5.66 M/s +read bandwidth(Socket: 0 Numa: 0 Channel: 2): 6.23 M/s +read bandwidth(Socket: 0 Numa: 0 Channel: 3): 5.30 M/s +read bandwidth(Socket: 0 Numa: 1 Channel: 4): 4.21 M/s +read bandwidth(Socket: 0 Numa: 1 Channel: 5): 4.06 M/s +read bandwidth(Socket: 0 Numa: 1 Channel: 6): 3.99 M/s +read bandwidth(Socket: 0 Numa: 1 Channel: 7): 3.89 M/s +... 
+write bandwidth(Socket: 1 Numa: 2 Channel: 1): 1.49 M/s +write bandwidth(Socket: 1 Numa: 2 Channel: 2): 1.44 M/s +write bandwidth(Socket: 1 Numa: 2 Channel: 3): 1.39 M/s +write bandwidth(Socket: 1 Numa: 2 Channel: 4): 1.22 M/s +write bandwidth(Socket: 1 Numa: 3 Channel: 4): 1.44 M/s +write bandwidth(Socket: 1 Numa: 3 Channel: 5): 1.43 M/s +write bandwidth(Socket: 1 Numa: 3 Channel: 6): 1.40 M/s +write bandwidth(Socket: 1 Numa: 3 Channel: 7): 1.38 M/s ``` ### 采集L3 cache的时延 @@ -826,6 +837,102 @@ kperf.PmuClose(fd) pcie bw(16:04.0): 124122412 Bytes/ns ``` +### 采集跨numa/跨socket访问HHA比例 +libkperf提供了采集跨numa/跨socket访问HHA的操作比例的能力,用于分析访存型应用的性能瓶颈,采集以numa为粒度。 + +参考代码: +```c++ +// c++代码示例 +#include +#include "symbol.h" +#include "pmu.h" + +PmuDeviceAttr devAttr[2]; +// 采集跨numa访问HHA的操作比例 +devAttr[0].metric = PMU_HHA_CROSS_NUMA; +// 采集跨socket访问HHA的操作比例 +devAttr[1].metric = PMU_HHA_CROSS_SOCKET; +// 初始化采集任务 +int pd = PmuDeviceOpen(devAttr, 2); +// 开始采集 +PmuEnable(pd); +sleep(1); +PmuData *oriData = nullptr; +int oriLen = PmuRead(pd, &oriData); +PmuDeviceData *devData = nullptr; +auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 2, &devData); +// devData的长度为2 * n (n为numa个数)。前n个是跨numa访问比例,后n个是跨socket访问比例。 +for (int i = 0; i < len / 2; ++i) { + cout << "HHA cross-numa operations ratio (Numa: " << devData[i].numaId << "): " << devData[i].count<< "\n"; +} +for (int i = len / 2; i < len; ++i) { + cout << "HHA cross-socket operations ratio (Numa: " << devData[i].numaId << "): " << devData[i].count<< "\n"; +} +DevDataFree(devData); +PmuDataFree(oriData); +PmuDisable(pd); +``` + +```python +# python代码示例 +import kperf +import time + +dev_attr = [ + kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_HHA_CROSS_NUMA), + kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_HHA_CROSS_SOCKET) +] +pd = kperf.device_open(dev_attr) +kperf.enable(pd) +time.sleep(1) +kperf.disable(pd) +ori_data = kperf.read(pd) +dev_data = kperf.get_device_metric(ori_data, dev_attr) +for data in dev_data.iter: + if data.metric == 
kperf.PmuDeviceMetric.PMU_HHA_CROSS_NUMA: + print(f"HHA cross-numa operations ratio (Numa: {data.numaId}): {data.count}") + if data.metric == kperf.PmuDeviceMetric.PMU_HHA_CROSS_SOCKET: + print(f"HHA cross-socket operations ratio (Numa: {data.numaId}): {data.count}") +``` + +```go +// go代码用例 +import "libkperf/kperf" +import "fmt" +import "time" + +deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_HHA_CROSS_NUMA}, kperf.PmuDeviceAttr{Metric: kperf.PMU_HHA_CROSS_SOCKET}} +fd, _ := kperf.PmuDeviceOpen(deviceAttrs) +kperf.PmuEnable(fd) +time.Sleep(1 * time.Second) +kperf.PmuDisable(fd) +dataVo, _ := kperf.PmuRead(fd) +deivceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs) +for _, v := range deivceDataVo.GoDeviceData { + if v.Metric == kperf.PMU_HHA_CROSS_NUMA { + fmt.Printf("HHA cross-numa operations ratio (Numa: %v): %v\n", v.NumaId, v.Count) + } + if v.Metric == kperf.PMU_HHA_CROSS_SOCKET { + fmt.Printf("HHA cross-socket operations ratio (Numa: %v): %v\n", v.NumaId, v.Count) + } +} +kperf.DevDataFree(deivceDataVo) +kperf.PmuDataFree(dataVo) +kperf.PmuClose(fd) +``` + +执行上述代码,输出的结果类似如下: +``` +HHA cross-numa operations ratio (Numa: 0): 0.438888 +HHA cross-numa operations ratio (Numa: 1): 0.0248052 +HHA cross-numa operations ratio (Numa: 2): 0.0277224 +HHA cross-numa operations ratio (Numa: 3): 0.181404 +HHA cross-socket operations ratio (Numa: 0): 0.999437 +HHA cross-socket operations ratio (Numa: 1): 0.0253748 +HHA cross-socket operations ratio (Numa: 2): 0.329864 +HHA cross-socket operations ratio (Numa: 3): 0.18956 +``` + ### 采集系统调用函数耗时信息 libkperf基于tracepoint事件采集能力,在原有能力的基础上,重新封装了一组相关的调用API,来提供采集系统调用函数耗时信息的能力,类似于perf trace命令 diff --git a/docs/Go_API.md b/docs/Go_API.md index fa1ba8f8f9dba44ddcd052317675bcaaa663cb52..9e636d471a9dcaf59f325f2c34476e846505200b 100644 --- a/docs/Go_API.md +++ b/docs/Go_API.md @@ -334,9 +334,9 @@ func main() { func PmuDeviceOpen(attr []PmuDeviceAttr) (int, error) 初始化采集uncore事件指标的能力 * type PmuDeviceAttr 
struct: - * Metic: 指定需要采集的指标 - * PMU_DDR_READ_BW 采集每个numa的ddrc的读带宽,单位:Bytes - * PMU_DDR_WRITE_BW 采集每个numa的ddrc的写带宽,单位:Bytes + * Metric: 指定需要采集的指标 + * PMU_DDR_READ_BW 采集每个channel的ddrc的读带宽,单位:Bytes + * PMU_DDR_WRITE_BW 采集每个channel的ddrc的写带宽,单位:Bytes * PMU_L3_TRAFFIC 采集每个core的L3的访问字节数,单位:Bytes * PMU_L3_MISS 采集每个core的L3的miss数量,单位:count * PMU_L3_REF 采集每个core的L3的总访问数量,单位:count @@ -346,6 +346,8 @@ func PmuDeviceOpen(attr []PmuDeviceAttr) (int, error) 初始化采集uncore事 * PMU_PCIE_TX_MRD_BW 采集pcie设备的tx方向上的读带宽,单位:Bytes/ns * PMU_PCIE_TX_MWR_BW 采集pcie设备的tx方向上的读带宽,单位:Bytes/ns * PMU_SMMU_TRAN 采集指定smmu设备的地址转换次数,单位:count + * PMU_HHA_CROSS_NUMA 采集每个numa的跨numa访问HHA的操作比例 + * PMU_HHA_CROSS_SOCKET 采集每个numa的跨socket访问HHA的操作比例 * Bdf: 指定需要采集设备的bdf号,只对pcie和smmu指标有效 * 返回值是int和error,pd > 0表示初始化成功,pd == -1初始化失败,可通过kperf.error()查看错误信息,以下是一个kperf.device_open的示例 @@ -370,14 +372,20 @@ func PmuGetDevMetric(dataVo PmuDataVo, deviceAttr []PmuDeviceAttr) (PmuDeviceDat * []PmuDeviceAttr: 指定需要聚合的指标参数 * typ PmuDeviceDataVo struct: * GoDeviceData []PmuDeviceData +* type DdrDataStructure struct { + ChannelId uint32 ddr数据的channel编号 + DdrNumaId uint32 ddr数据的numa编号 + SocketId uint32 ddr数据的socket编号 + } * type PmuDeviceData struct: * Metric C.enum_PmuDeviceMetric 采集的指标 * Count float64 指标的计数值 - * Mode C.enum_PmuMetricMode 指标的采集类型,按core、按numa还是按bdf号 + * Mode C.enum_PmuMetricMode 指标的采集类型,按core、按numa、按channel还是按bdf号 * CoreId uint32 数据的core编号 * NumaId uint32 数据的numa编号 * ClusterId uint32 簇ID * Bdf string 数据的bdf编号 + * DdrDataStructure ddr相关的统计数据 ### kperf.DevDataFree diff --git a/docs/Python_API.md b/docs/Python_API.md index 1ed876665a87a80e462d46b60bea7a22c171f5de..365057890ad558ec488506f43518cd0082e840f3 100644 --- a/docs/Python_API.md +++ b/docs/Python_API.md @@ -5,7 +5,7 @@ kperf.open(collector_type: kperf.PmuTaskType, pmu_attr: kperf.PmuAttr) * class PmuTaskType - * COUTING PMU计数模式 + * COUNTING PMU计数模式 * SAMPLING PMU采样模式 * SPE_SAMPLING SPE采样模式 * class PmuAttr @@ -51,7 +51,7 @@ kperf.open(collector_type: 
kperf.PmuTaskType, pmu_attr: kperf.PmuAttr) * SPE_EVENT_MISPREDICTED = 0x80 # mispredict * minLatency 仅收集该latency或者更高的样本数据 * includeNewFork - 是否支持子线程拆分,仅在COUTING模式中支持 + 是否支持子线程拆分,仅在COUNTING模式中支持 * branchSampleFilter * KPERF_NO_BRANCH_SAMPLE = 0 不采集branch sample stack数据 * KPERF_SAMPLE_BRANCH_USER = 1 << 0 分支目标位于用户空间 @@ -329,18 +329,20 @@ for func_name in kperf.sys_call_func_list(): kperf.device_open(dev_attr: List[PmuDeviceAttr]) 初始化采集uncore事件指标的能力 * class PmuDeviceAttr: - * metic: 指定需要采集的指标 - * PMU_DDR_READ_BW 采集每个numa的ddrc的读带宽,单位:Bytes - * PMU_DDR_WRITE_BW 采集每个numa的ddrc的写带宽,单位:Bytes + * metric: 指定需要采集的指标 + * PMU_DDR_READ_BW 采集每个channel的ddrc的读带宽,单位:Bytes + * PMU_DDR_WRITE_BW 采集每个channel的ddrc的写带宽,单位:Bytes * PMU_L3_TRAFFIC 采集每个core的L3的访问字节数,单位:Bytes * PMU_L3_MISS 采集每个core的L3的miss数量,单位:count * PMU_L3_REF 采集每个core的L3的总访问数量,单位:count - * PMU_L3_LAT 采集每个numa的L3的总时延,单位:cycles + * PMU_L3_LAT 采集每个cluster的L3的总时延,单位:cycles * PMU_PCIE_RX_MRD_BW 采集pcie设备的rx方向上的读带宽,单位:Bytes/ns * PMU_PCIE_RX_MWR_BW 采集pcie设备的rx方向上的写带宽,单位:Bytes/ns * PMU_PCIE_TX_MRD_BW 采集pcie设备的tx方向上的读带宽,单位:Bytes/ns * PMU_PCIE_TX_MWR_BW 采集pcie设备的tx方向上的读带宽,单位:Bytes/ns * PMU_SMMU_TRAN 采集指定smmu设备的地址转换次数,单位:count + * PMU_HHA_CROSS_NUMA 采集每个numa的跨numa访问HHA的操作比例 + * PMU_HHA_CROSS_SOCKET 采集每个numa的跨socket访问HHA的操作比例 * bdf: 指定需要采集设备的bdf号,只对pcie和smmu指标有效 * 返回值是int类型,pd > 0表示初始化成功,pd == -1初始化失败,可通过kperf.error()查看错误信息,以下是一个kperf.device_open的示例 @@ -365,14 +367,19 @@ kperf.get_device_metric(pmu_data: PmuData, device_attr: List[PmuDeviceAttr]) 对 * len: 数据长度 * iter: 返回iterator[ImplPmuDeviceData] * free: 释放当前PmuDeviceData +* class DdrDataStructure: + * channelId: ddr数据的channel编号 + * ddrNumaId: ddr数据的numa编号 + * socketId: ddr数据的socket编号 * class ImplPmuDeviceData: * metric: 采集的指标 * count:指标的计数值 - * mode: 指标的采集类型,按core、按numa还是按bdf号 + * mode: 指标的采集类型,按core、按numa、按channel还是按bdf号 * union: * coreId: 数据的core编号 * numaId: 数据的numa编号 * bdf: 数据的bdf编号 + * DdrDataStructure: ddr相关的统计数据 ### kperf.device_bdf_list diff --git 
a/go/src/libkperf/kperf/kperf.go b/go/src/libkperf/kperf/kperf.go index 2b5958ca39adc2fd88a5f2a6e7eb43c12fd4a631..972dbe4976dcd068522795fa2fd4ef266ef4d3d1 100644 --- a/go/src/libkperf/kperf/kperf.go +++ b/go/src/libkperf/kperf/kperf.go @@ -37,6 +37,9 @@ struct MetricDataExt { unsigned coreId; unsigned clusterId; char* bdf; + unsigned channelId; + unsigned ddrNumaId; + unsigned socketId; }; void SetPeriod(struct PmuAttr* attr, unsigned period) { @@ -122,6 +125,11 @@ void IPmuGetMetricDataExt(struct PmuDeviceData* deviceData, struct MetricDataExt case PMU_METRIC_CLUSTER: metricData->clusterId = deviceData->clusterId; break; + case PMU_METRIC_CHANNEL: + metricData->channelId = deviceData->channelId; + metricData->ddrNumaId = deviceData->ddrNumaId; + metricData->socketId = deviceData->socketId; + break; } } @@ -237,12 +245,12 @@ var ( // PmuDeviceMetric var ( - // Pernuma metric. - // Collect ddr read bandwidth for each numa node. + // Perchannel metric. + // Collect ddr read bandwidth for each channel. // Unit: Bytes/s PMU_DDR_READ_BW C.enum_PmuDeviceMetric = C.PMU_DDR_READ_BW - // Pernuma metric. - // Collect ddr write bandwidth for each numa node. + // Perchannel metric. + // Collect ddr write bandwidth for each channel. // Unit: Bytes/s PMU_DDR_WRITE_BW C.enum_PmuDeviceMetric = C.PMU_DDR_WRITE_BW // Percore metric. @@ -257,8 +265,8 @@ var ( // Collect L3 total reference count, including miss and hit count. // Unit: count PMU_L3_REF C.enum_PmuDeviceMetric = C.PMU_L3_REF - // Pernuma metric. - // Collect L3 total latency for each numa node. + // Percluster metric. + // Collect L3 total latency for each cluster node. // Unit: cycles PMU_L3_LAT C.enum_PmuDeviceMetric = C.PMU_L3_LAT // Collect pcie rx bandwidth. @@ -276,6 +284,12 @@ var ( // Collect smmu address transaction. // Unit: count PMU_SMMU_TRAN C.enum_PmuDeviceMetric = C.PMU_SMMU_TRAN + // Pernuma metric. + // Collect rate of cross-numa operations received by HHA. 
+ PMU_HHA_CROSS_NUMA C.enum_PmuDeviceMetric = C.PMU_HHA_CROSS_NUMA + // Pernuma metric. + // Collect rate of cross-socket operations received by HHA. + PMU_HHA_CROSS_SOCKET C.enum_PmuDeviceMetric = C.PMU_HHA_CROSS_SOCKET ) // PmuBdfType @@ -291,6 +305,7 @@ var ( PMU_METRIC_NUMA C.enum_PmuMetricMode = C.PMU_METRIC_NUMA PMU_METRIC_CLUSTER C.enum_PmuMetricMode = C.PMU_METRIC_CLUSTER PMU_METRIC_BDF C.enum_PmuMetricMode = C.PMU_METRIC_BDF + PMU_METRIC_CHANNEL C.enum_PmuMetricMode = C.PMU_METRIC_CHANNEL ) var fdModeMap map[int]C.enum_PmuTaskType = make(map[int]C.enum_PmuTaskType) @@ -396,6 +411,12 @@ type PmuDeviceAttr struct { Bdf string } +type DdrDataStructure struct { + ChannelId uint32 + DdrNumaId uint32 + SocketId uint32 +} + type PmuDeviceData struct { Metric C.enum_PmuDeviceMetric // The metric value. The meaning of value depends on metric type. @@ -406,6 +427,7 @@ type PmuDeviceData struct { NumaId uint32 // for pernuma metric ClusterId uint32 // for percluster metric Bdf string // for perpcie metric + DdrDataStructure // for perchannel metric } type PmuDeviceDataVo struct { @@ -983,6 +1005,9 @@ func PmuGetDevMetric(dataVo PmuDataVo, deviceAttr []PmuDeviceAttr) (PmuDeviceDat goDeviceList[i].NumaId = uint32(metricDataExt.numaId) goDeviceList[i].ClusterId = uint32(metricDataExt.clusterId) goDeviceList[i].Bdf = C.GoString(metricDataExt.bdf) + goDeviceList[i].ChannelId = uint32(metricDataExt.channelId) + goDeviceList[i].DdrNumaId = uint32(metricDataExt.ddrNumaId) + goDeviceList[i].SocketId = uint32(metricDataExt.socketId) } res.GoDeviceData = goDeviceList res.cDeviceData = metricData diff --git a/go/src/libkperf_test/libkperf_test.go b/go/src/libkperf_test/libkperf_test.go index 2f55951332b8b5224bb72e7f71fda562fdd3a405..e64ea6a1687dff432c07ae8cad59b60e031200c9 100644 --- a/go/src/libkperf_test/libkperf_test.go +++ b/go/src/libkperf_test/libkperf_test.go @@ -22,7 +22,7 @@ func TestCount(t *testing.T) { } for _, o := range dataVo.GoData { - 
t.Logf("================================Get Couting data success================================") + t.Logf("================================Get Counting data success================================") t.Logf("count base info comm=%v, evt=%v, pid=%v, tid=%v, coreId=%v, numaId=%v, sockedId=%v", o.Comm, o.Evt, o.Pid, o.Tid, o.CpuTopo.CoreId, o.CpuTopo.NumaId, o.CpuTopo.SocketId) t.Logf("count info count=%v, countPercent=%v", o.Count, o.CountPercent) } diff --git a/include/pmu.h b/include/pmu.h index af9bb2aeb92f8c3403122c86c3dd581c61f5bd6e..ae28aa925ec13423ec98bfb62ec775344a5f9523 100644 --- a/include/pmu.h +++ b/include/pmu.h @@ -404,12 +404,12 @@ int PmuGetField(struct SampleRawData *rawData, const char *fieldName, void *valu struct SampleRawField *PmuGetFieldExp(struct SampleRawData *rawData, const char *fieldName); enum PmuDeviceMetric { - // Pernuma metric. - // Collect ddr read bandwidth for each numa node. + // Perchannel metric. + // Collect ddr read bandwidth for each channel. // Unit: Bytes PMU_DDR_READ_BW, - // Pernuma metric. - // Collect ddr write bandwidth for each numa node. + // Perchannel metric. + // Collect ddr write bandwidth for each channel. // Unit: Bytes PMU_DDR_WRITE_BW, // Percore metric. @@ -442,7 +442,13 @@ enum PmuDeviceMetric { // Perpcie metric. // Collect smmu address transaction. // Unit: count - PMU_SMMU_TRAN + PMU_SMMU_TRAN, + // Pernuma metric. + // Collect rate of cross-numa operations received by HHA. + PMU_HHA_CROSS_NUMA, + // Pernuma metric. + // Collect rate of cross-socket operations received by HHA. 
+ PMU_HHA_CROSS_SOCKET }; struct PmuDeviceAttr { @@ -463,7 +469,8 @@ enum PmuMetricMode { PMU_METRIC_CORE, PMU_METRIC_NUMA, PMU_METRIC_CLUSTER, - PMU_METRIC_BDF + PMU_METRIC_BDF, + PMU_METRIC_CHANNEL }; /** @@ -502,6 +509,12 @@ struct PmuDeviceData { unsigned clusterId; // for perpcie metric char *bdf; + // for perchannel metric of ddr + struct { + unsigned channelId; + unsigned ddrNumaId; + unsigned socketId; + }; }; }; diff --git a/pmu/pmu_metric.cpp b/pmu/pmu_metric.cpp index a2163a1dd1493e9937e3d7f9468ceb6cd309b3f8..957c30b093def1c6cb83787c84692d2af9b851e8 100644 --- a/pmu/pmu_metric.cpp +++ b/pmu/pmu_metric.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,7 @@ using namespace std; using namespace pcerr; +using IdxMap = unordered_map>; static unsigned maxCpuNum = 0; static vector coreArray; @@ -82,12 +84,15 @@ namespace KUNPENG_PMU { {PmuDeviceMetric::PMU_PCIE_RX_MWR_BW, "PMU_PCIE_RX_MWR_BW"}, {PmuDeviceMetric::PMU_PCIE_TX_MRD_BW, "PMU_PCIE_TX_MRD_BW"}, {PmuDeviceMetric::PMU_PCIE_TX_MWR_BW, "PMU_PCIE_TX_MWR_BW"}, - {PmuDeviceMetric::PMU_SMMU_TRAN, "PMU_SMMU_TRAN"} + {PmuDeviceMetric::PMU_SMMU_TRAN, "PMU_SMMU_TRAN"}, + {PmuDeviceMetric::PMU_HHA_CROSS_NUMA, "PMU_HHA_CROSS_NUMA"}, + {PmuDeviceMetric::PMU_HHA_CROSS_SOCKET, "PMU_HHA_CROSS_SOCKET"}, }; set percoreMetric = {PMU_L3_TRAFFIC, PMU_L3_MISS, PMU_L3_REF}; - set pernumaMetric = {PMU_DDR_READ_BW, PMU_DDR_WRITE_BW, PMU_L3_LAT}; + set pernumaMetric = {PMU_HHA_CROSS_NUMA, PMU_HHA_CROSS_SOCKET}; set perClusterMetric = {PMU_L3_LAT}; + set perChannelMetric = {PMU_DDR_READ_BW, PMU_DDR_WRITE_BW}; set perpcieMetric = {PMU_PCIE_RX_MRD_BW, PMU_PCIE_RX_MWR_BW, PMU_PCIE_TX_MRD_BW, @@ -105,7 +110,7 @@ namespace KUNPENG_PMU { if (it != MetricToString.end()) { return it->second; } - return ""; + return ""; } using PMU_METRIC_PAIR = std::pair; @@ -266,6 +271,30 @@ namespace KUNPENG_PMU { 2 } }; + + PMU_METRIC_PAIR HHA_CROSS_NUMA = { + PmuDeviceMetric::PMU_HHA_CROSS_NUMA, + { + 
"hisi_sccl", + "hha", + {"0x0", "0x02"}, + "", + "", + 0 + } + }; + + PMU_METRIC_PAIR HHA_CROSS_SOCKET = { + PmuDeviceMetric::PMU_HHA_CROSS_SOCKET, + { + "hisi_sccl", + "hha", + {"0x0", "0x01"}, + "", + "", + 0 + } + }; } static const map HIP_A_UNCORE_METRIC_MAP { @@ -275,6 +304,8 @@ namespace KUNPENG_PMU { METRIC_CONFIG::L3_MISS, METRIC_CONFIG::L3_REF, METRIC_CONFIG::SMMU_TRAN, + METRIC_CONFIG::HHA_CROSS_NUMA, + METRIC_CONFIG::HHA_CROSS_SOCKET, }; static const map HIP_B_UNCORE_METRIC_MAP { @@ -289,6 +320,8 @@ namespace KUNPENG_PMU { METRIC_CONFIG::PCIE_TX_MRD_BW, METRIC_CONFIG::PCIE_TX_MWR_BW, METRIC_CONFIG::SMMU_TRAN, + METRIC_CONFIG::HHA_CROSS_NUMA, + METRIC_CONFIG::HHA_CROSS_SOCKET, }; const UNCORE_METRIC_MAP UNCORE_METRIC_CONFIG_MAP = { @@ -300,7 +333,7 @@ namespace KUNPENG_PMU { { CHIP_TYPE chipType = GetCpuType(); if (UNCORE_METRIC_CONFIG_MAP.find(chipType) == UNCORE_METRIC_CONFIG_MAP.end()) { - return {}; + return {}; } return UNCORE_METRIC_CONFIG_MAP.at(chipType); } @@ -849,7 +882,7 @@ namespace KUNPENG_PMU { } // remove duplicate device attribute - static int RemoveDupDeviceAttr(struct PmuDeviceAttr *attr, unsigned len, std::vector& deviceAttr, bool l3ReDup) + static int RemoveDupDeviceAttr(struct PmuDeviceAttr *attr, unsigned len, std::vector& deviceAttr) { std::unordered_set uniqueSet; for (int i = 0; i < len; ++i) { @@ -861,17 +894,6 @@ namespace KUNPENG_PMU { } if (uniqueSet.find(key) == uniqueSet.end()) { - // when in deviceopen remove the same PMU_L3_TRAFFIC and PMU_L3_REF, - // but when getDevMetric we need to keep them. 
- if (l3ReDup == true && - (attr[i].metric == PmuDeviceMetric::PMU_L3_TRAFFIC || attr[i].metric == PmuDeviceMetric::PMU_L3_REF)) { - if (uniqueSet.find(std::to_string(PmuDeviceMetric::PMU_L3_TRAFFIC)) != uniqueSet.end()) { - continue; - } - if (uniqueSet.find(std::to_string(PmuDeviceMetric::PMU_L3_REF)) != uniqueSet.end()) { - continue; - } - } uniqueSet.insert(key); deviceAttr.emplace_back(attr[i]); } @@ -888,6 +910,11 @@ namespace KUNPENG_PMU { unsigned numaId; unsigned clusterId; char *bdf; + struct { + unsigned channelId; + unsigned ddrNumaId; + unsigned socketId; + }; }; }; @@ -936,7 +963,7 @@ namespace KUNPENG_PMU { switch(metric) { case PMU_DDR_READ_BW: case PMU_DDR_WRITE_BW: - return PMU_METRIC_NUMA; + return PMU_METRIC_CHANNEL; case PMU_L3_LAT: return PMU_METRIC_CLUSTER; case PMU_L3_TRAFFIC: @@ -949,6 +976,9 @@ namespace KUNPENG_PMU { case PMU_PCIE_TX_MWR_BW: case PMU_SMMU_TRAN: return PMU_METRIC_BDF; + case PMU_HHA_CROSS_NUMA: + case PMU_HHA_CROSS_SOCKET: + return PMU_METRIC_NUMA; } return PMU_METRIC_INVALID; } @@ -969,25 +999,57 @@ namespace KUNPENG_PMU { int AggregateByNuma(const PmuDeviceMetric metric, const vector &rawData, vector &devData) { - map devDataByNuma; + const auto& deviceConfig = GetDeviceMtricConfig(); + const auto& findConfig = deviceConfig.find(metric); + if (findConfig == deviceConfig.end()) { + return SUCCESS; + } + auto &evts = findConfig->second.events; + if (evts.size() != 2) { + return SUCCESS; + } + // Event name for total access count. + string totalEvt = evts[0]; + // Event name for cross-numa/cross-socket count. + string crossEvt = evts[1]; + // Sort data by numa, and then sort by event string. 
+ map> devDataByNuma; for (auto &data : rawData) { + string devName; + string evtName; + if (!GetDeviceName(data.evtName, devName, evtName)) { + continue; + } + auto evtConfig = ExtractEvtStr("config", evtName); auto findData = devDataByNuma.find(data.numaId); if (findData == devDataByNuma.end()) { - PmuDeviceData outData; - outData.metric = data.metric; - outData.count = data.count; - outData.mode = GetMetricMode(data.metric); - outData.numaId = data.numaId; - devDataByNuma[data.numaId] = outData; + devDataByNuma[data.numaId][evtConfig] = data; } else { - findData->second.count += data.count; + devDataByNuma[data.numaId][evtConfig].count += data.count; } } for (auto &data : devDataByNuma) { - devData.push_back(data.second); + // Get events of cross-numa/cross-socket access count and total access count. + auto findcrossData = data.second.find(crossEvt); + auto findtotalData = data.second.find(totalEvt); + if (findcrossData == data.second.end() || findtotalData == data.second.end()) { + continue; + } + // Compute ratio: cross access count / total access count + double ratio = 0.0; + if (findtotalData->second.count != 0) { + ratio = (double)(findcrossData->second.count) / findtotalData->second.count; + } else { + ratio = -1; + } + PmuDeviceData outData; + outData.metric = metric; + outData.count = ratio; + outData.mode = GetMetricMode(metric); + outData.numaId = data.first; + devData.push_back(outData); } - return SUCCESS; } @@ -1064,6 +1126,108 @@ namespace KUNPENG_PMU { return SUCCESS; } + static IdxMap DDRC_CHANNEL_MAP_HIPA = { + {1, {{0, 0}, {1, 1}, {2, 2}, {3, 3}}}, + {3, {{0, 4}, {1, 5}, {2, 6}, {3, 7}}}, + {5, {{0, 0}, {1, 1}, {2, 2}, {3, 3}}}, + {7, {{0, 4}, {1, 5}, {2, 6}, {3, 7}}}, + }; + static IdxMap DDRC_CHANNEL_MAP_HIPB = { + {3, {{0, 0}, {2, 1}, {3, 2}, {5, 3}}}, + {1, {{0, 4}, {2, 5}, {3, 6}, {5, 7}}}, + {11, {{0, 0}, {2, 1}, {3, 2}, {5, 3}}}, + {9, {{0, 4}, {2, 5}, {3, 6}, {5, 7}}}, + }; + + static unordered_map DDRC_CHANNEL_MAP = { + {HIPA, 
DDRC_CHANNEL_MAP_HIPA}, + {HIPB, DDRC_CHANNEL_MAP_HIPB}, + }; + + static int ParseDDRIdx(const string &devName, const string prefix) + { + size_t ddrcPos = devName.find(prefix); + size_t channelIndex = ddrcPos + prefix.length(); + string ddrcIndexStr = devName.substr(channelIndex); + size_t separatorPos = ddrcIndexStr.find("_"); + int ddrcIndex = separatorPos != string::npos ? stoi(ddrcIndexStr.substr(0, separatorPos)) : stoi(ddrcIndexStr); + return ddrcIndex; + } + + static bool getChannelId(const char *evt, const unsigned ddrNumaId, unsigned &channelId) + { + string devName; + string evtName; + if (!GetDeviceName(evt, devName, evtName)) { + return false; + } + // ddrc channel index. eg: hisi_sccl3_ddrc3_1 --> 3_1 + int ddrcIndex = ParseDDRIdx(devName, "ddrc"); + int scclIndex = ParseDDRIdx(devName, "sccl"); + + CHIP_TYPE chipType = GetCpuType(); //get channel index + if (DDRC_CHANNEL_MAP.find(chipType) == DDRC_CHANNEL_MAP.end()) { + return false; + } + + auto &ddrcChannelList = DDRC_CHANNEL_MAP[chipType]; + auto ddrIdxMap = ddrcChannelList.find(scclIndex); + if (ddrIdxMap != ddrcChannelList.end()) { + auto channelIdx = ddrIdxMap->second.find(ddrcIndex); + if (channelIdx != ddrIdxMap->second.end()) { + channelId = channelIdx->second; + return true; + } + } + return false; + } + + struct channelKeyHash { + size_t operator()(const tuple& key) const { + auto socketIdHash = hash{}(get<0>(key)); + auto channelIdHash = hash{}(get<1>(key)); + auto ddrNumaIdHash = hash{}(get<2>(key)); + return socketIdHash ^ (channelIdHash << 1) ^ (ddrNumaIdHash << 2); + } + }; + + int AggregateByChannel(const PmuDeviceMetric metric, const vector &rawData, vector &devData) + { + unordered_map, PmuDeviceData, channelKeyHash> devDataByChannel; //Key: socketId, channelId, ddrNumaId + for (auto &data : rawData) { + unsigned channelId; + if (!getChannelId(data.evtName, data.ddrNumaId, channelId)) { + continue; + } + auto ddrDatakey = make_tuple(data.socketId, channelId, data.ddrNumaId); + auto 
findData = devDataByChannel.find(ddrDatakey); + if (findData == devDataByChannel.end()) { + PmuDeviceData outData; + outData.metric = data.metric; + outData.count = data.count; + outData.mode = GetMetricMode(data.metric); + outData.channelId = channelId; + outData.ddrNumaId = data.ddrNumaId; + outData.socketId = data.socketId; + devDataByChannel[ddrDatakey] = outData; + } else { + findData->second.count += data.count; + } + } + + vector, PmuDeviceData>> sortedVec(devDataByChannel.begin(), devDataByChannel.end()); + sort(sortedVec.begin(), sortedVec.end(), []( + const pair, PmuDeviceData>& a, + const pair, PmuDeviceData>& b) { + return a.first < b.first; + }); + for (auto &data : sortedVec) { + devData.push_back(data.second); + } + + return SUCCESS; + } + int PcieBWAggregate(const PmuDeviceMetric metric, const vector &rawData, vector &devData) { const auto& deviceConfig = GetDeviceMtricConfig(); @@ -1146,14 +1310,16 @@ namespace KUNPENG_PMU { {PMU_DDR_WRITE_BW, DDRBw}, {PMU_L3_TRAFFIC, L3Bw}}; map aggregateMap = { - {PMU_DDR_READ_BW, AggregateByNuma}, - {PMU_DDR_WRITE_BW, AggregateByNuma}, + {PMU_DDR_READ_BW, AggregateByChannel}, + {PMU_DDR_WRITE_BW, AggregateByChannel}, {PMU_L3_LAT, AggregateByCluster}, {PMU_PCIE_RX_MRD_BW, PcieBWAggregate}, {PMU_PCIE_RX_MWR_BW, PcieBWAggregate}, {PMU_PCIE_TX_MRD_BW, PcieBWAggregate}, {PMU_PCIE_TX_MWR_BW, PcieBWAggregate}, {PMU_SMMU_TRAN, SmmuTransAggregate}, + {PMU_HHA_CROSS_NUMA, AggregateByNuma}, + {PMU_HHA_CROSS_SOCKET, AggregateByNuma}, }; static bool IsMetricEvent(const string &devName, const string &evtName, const PmuDeviceAttr &devAttr) @@ -1256,6 +1422,10 @@ namespace KUNPENG_PMU { if (perClusterMetric.find(devAttr.metric) != perClusterMetric.end()) { devData.clusterId = pmuData[i].cpuTopo->coreId / clusterWidth; } + if (perChannelMetric.find(devAttr.metric) != perChannelMetric.end()) { + devData.ddrNumaId = pmuData[i].cpuTopo->numaId; + devData.socketId = pmuData[i].cpuTopo->socketId; + } if (IsBdfMetric(devAttr.metric)) 
{ devData.bdf = devAttr.bdf; } @@ -1331,7 +1501,7 @@ int PmuDeviceOpen(struct PmuDeviceAttr *attr, unsigned len) } // Remove duplicate device attributes. vector deviceAttr; - if (RemoveDupDeviceAttr(attr, len, deviceAttr, true) != SUCCESS) { + if (RemoveDupDeviceAttr(attr, len, deviceAttr) != SUCCESS) { return -1; } vector configEvtList; @@ -1343,8 +1513,17 @@ int PmuDeviceOpen(struct PmuDeviceAttr *attr, unsigned len) configEvtList.insert(configEvtList.end(), temp.begin(), temp.end()); } - vector evts; + //remove the same event of PMU_L3_TRAFFIC and PMU_L3_REF, PMU_HHA_CROSS_NUMA and PMU_HHA_CROSS_SOCKET + unordered_set tmpEvents; + vector filteredEvtList; for (auto& evt : configEvtList) { + if (tmpEvents.find(evt) == tmpEvents.end()) { + tmpEvents.insert(evt); + filteredEvtList.push_back(evt); + } + } + vector evts; + for (auto& evt : filteredEvtList) { evts.push_back(const_cast(evt.c_str())); } @@ -1391,7 +1570,7 @@ int PmuGetDevMetric(struct PmuData *pmuData, unsigned len, } // Remove duplicate device attributes. 
vector deviceAttr; - if (RemoveDupDeviceAttr(attr, attrLen, deviceAttr, false) != SUCCESS) { + if (RemoveDupDeviceAttr(attr, attrLen, deviceAttr) != SUCCESS) { return -1; } // Filter pmuData by metric and generate InnerDeviceData, diff --git a/python/modules/_libkperf/Pmu.py b/python/modules/_libkperf/Pmu.py index 74f2c9d9b1853322321cf7af64f43c73baa9d776..350cc9f97147abfe679099f493dd54e983db6c84 100644 --- a/python/modules/_libkperf/Pmu.py +++ b/python/modules/_libkperf/Pmu.py @@ -467,6 +467,12 @@ class PmuDeviceAttr: pmu_device_attr.__c_pmu_device_attr = c_pmu_device_attr return pmu_device_attr +class DdrDataStructure(ctypes.Structure): + _fields_ = [ + ('channelId', ctypes.c_uint), + ('ddrNumaId', ctypes.c_uint), + ('socketId', ctypes.c_uint) + ] class CtypesPmuDeviceData(ctypes.Structure): """ @@ -479,6 +485,11 @@ class CtypesPmuDeviceData(ctypes.Structure): unsigned numaId; unsigned clusterId; char *bdf; + struct { + unsigned channelId; + unsigned ddrNumaId; + unsigned socketId; + }; }; }; """ @@ -487,7 +498,8 @@ class CtypesPmuDeviceData(ctypes.Structure): ('coreId', ctypes.c_uint), ('numaId', ctypes.c_uint), ('clusterId', ctypes.c_uint), - ('bdf', ctypes.c_char_p) + ('bdf', ctypes.c_char_p), + ('_structure', DdrDataStructure) ] _fields_ = [ @@ -521,6 +533,23 @@ class CtypesPmuDeviceData(ctypes.Structure): return self._union.bdf.decode(UTF_8) return "" + @property + def channelId(self) -> int: + if self.mode == 5 and self._union._structure.channelId: # PMU_METRIC_CHANNEL + return self._union._structure.channelId + return 0 + + @property + def ddrNumaId(self) -> int: + if self.mode == 5 and self._union._structure.ddrNumaId: # PMU_METRIC_CHANNEL + return self._union._structure.ddrNumaId + return 0 + + @property + def socketId(self) -> int: + if self.mode == 5 and self._union._structure.socketId: # PMU_METRIC_CHANNEL + return self._union._structure.socketId + return 0 class ImplPmuDeviceData: __slots__ = ['__c_pmu_device_data'] @@ -574,6 +603,24 @@ class 
ImplPmuDeviceData: return self.c_pmu_device_data._union.bdf.decode(UTF_8) return "" + @property + def channelId(self) -> int: + if self.mode == 5 and self.c_pmu_device_data._union._structure.channelId: # PMU_METRIC_CHANNEL + return self.c_pmu_device_data._union._structure.channelId + return 0 + + @property + def ddrNumaId(self) -> int: + if self.mode == 5 and self.c_pmu_device_data._union._structure.ddrNumaId: # PMU_METRIC_CHANNEL + return self.c_pmu_device_data._union._structure.ddrNumaId + return 0 + + @property + def socketId(self) -> int: + if self.mode == 5 and self.c_pmu_device_data._union._structure.socketId: # PMU_METRIC_CHANNEL + return self.c_pmu_device_data._union._structure.socketId + return 0 + @classmethod def from_c_pmu_device_data(cls, c_pmu_device_data: CtypesPmuDeviceData) -> 'ImplPmuDeviceData': pmu_device_data = cls() diff --git a/python/modules/kperf/pmu.py b/python/modules/kperf/pmu.py index 975b1221fdf8565c2c34b717a2f201a8728abe1f..bd791caa8f4f003f102dbd44bb18edaa94e72ec7 100644 --- a/python/modules/kperf/pmu.py +++ b/python/modules/kperf/pmu.py @@ -107,12 +107,12 @@ class SymbolMode: RESOLVE_ELF_DWARF = 2 # Resolve elf and dwarf. All fields in Symbol will be valid. class PmuDeviceMetric: - # Pernuma metric. - # Collect ddr read bandwidth for each numa node. + # Perchannel metric. + # Collect ddr read bandwidth for each channel. # Unit: Bytes/s PMU_DDR_READ_BW = 0 - # Pernuma metric. - # Collect ddr write bandwidth for each numa node. + # Perchannel metric. + # Collect ddr write bandwidth for each channel. # Unit: Bytes/s PMU_DDR_WRITE_BW = 1 # Percore metric. @@ -127,8 +127,8 @@ class PmuDeviceMetric: # Collect L3 total reference count, including miss and hit count. # Unit: count PMU_L3_REF = 4 - # Pernuma metric. - # Collect L3 total latency for each numa node. + # Percluster metric. + # Collect L3 total latency for each cluster node. # Unit: cycles PMU_L3_LAT = 5 # Collect pcie rx bandwidth. 
@@ -146,6 +146,12 @@ class PmuDeviceMetric: # Collect smmu address transaction. # Unit: count PMU_SMMU_TRAN = 10 + # Pernuma metric. + # Collect rate of cross-numa operations received by HHA. + PMU_HHA_CROSS_NUMA = 11 + # Pernuma metric. + # Collect rate of cross-socket operations received by HHA. + PMU_HHA_CROSS_SOCKET = 12 class PmuDeviceAttr(_libkperf.PmuDeviceAttr): """ @@ -173,6 +179,7 @@ class PmuMetricMode: PMU_METRIC_NUMA = 2 PMU_METRIC_CLUSTER = 3 PMU_METRIC_BDF = 4 + PMU_METRIC_CHANNEL = 5 class ImplPmuDeviceData(_libkperf.ImplPmuDeviceData): pass @@ -193,6 +200,12 @@ class PmuDeviceData(_libkperf.PmuDeviceData): unsigned numaId; // for perpcie metric char *bdf; + // for perchannel metric of ddr + struct { + unsigned channelId; + unsigned ddrNumaId; + unsigned socketId; + }; }; }; """ diff --git a/python/tests/test_metric.py b/python/tests/test_metric.py index 4a878aed0ac5166d60ab52e47a2f2d05a26b001a..90c254be317d202ad78bd4a0523b726996d61d92 100644 --- a/python/tests/test_metric.py +++ b/python/tests/test_metric.py @@ -112,7 +112,8 @@ def test_get_numa_cores(): def test_collect_ddr_bandwidth(): dev_attr = [ - kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_READ_BW) + kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_READ_BW), + kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW) ] pd = kperf.device_open(dev_attr) print(kperf.error()) @@ -125,9 +126,10 @@ def test_collect_ddr_bandwidth(): dev_data = None dev_data = kperf.get_device_metric(ori_data, dev_attr) - assert len(dev_data) == 4 - assert dev_data[0].numaId == 0 - assert dev_data[0].mode == kperf.PmuMetricMode.PMU_METRIC_NUMA + assert dev_data[0].count != 0 + assert dev_data[0].metric == kperf.PmuDeviceMetric.PMU_DDR_READ_BW + assert dev_data[0].mode == kperf.PmuMetricMode.PMU_METRIC_CHANNEL + assert dev_data[len(dev_data) - 1].metric == kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW print_dev_data_details(dev_data) kperf.close(pd) @@ -150,26 +152,6 @@ def 
test_collect_l3_latency(): print_dev_data_details(dev_data) kperf.close(pd) -def test_collect_l3_latency_and_ddr(): - dev_attr = [ - kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_L3_LAT), - kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW) - ] - pd = kperf.device_open(dev_attr) - print(kperf.error()) - assert pd != -1, f"Expected non-negative pd, but got {pd}" - kperf.enable(pd) - time.sleep(1) - kperf.disable(pd) - ori_data = kperf.read(pd) - assert len(ori_data) != -1, f"Expected non-negative ori_len, but got {len(ori_data)}" - - dev_data = kperf.get_device_metric(ori_data, dev_attr) - assert len(dev_data) == get_cluster_nums() + 4 - print_dev_data_details(dev_data) - kperf.close(pd) - - def test_collect_l3_traffic(): dev_attr = [ kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_L3_TRAFFIC) @@ -274,6 +256,28 @@ def test_get_metric_smmu_transaction(): print_dev_data_details(dev_data) kperf.close(pd) +def test_collect_hha_cross(): + dev_attr = [ + kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_HHA_CROSS_SOCKET), + kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_HHA_CROSS_NUMA) + ] + pd = kperf.device_open(dev_attr) + print(kperf.error()) + assert pd != -1, f"Expected non-negative pd, but got {pd}" + kperf.enable(pd) + time.sleep(1) + kperf.disable(pd) + ori_data = kperf.read(pd) + assert len(ori_data) != -1, f"Expected non-negative ori_len, but got {len(ori_data)}" + + dev_data = kperf.get_device_metric(ori_data, dev_attr) + assert dev_data[0].metric == kperf.PmuDeviceMetric.PMU_HHA_CROSS_SOCKET + assert dev_data[0].mode == kperf.PmuMetricMode.PMU_METRIC_NUMA + assert dev_data[-1].metric == kperf.PmuDeviceMetric.PMU_HHA_CROSS_NUMA + assert dev_data[-1].mode == kperf.PmuMetricMode.PMU_METRIC_NUMA + print_dev_data_details(dev_data) + kperf.close(pd) + if __name__ == '__main__': # 提示用户使用pytest 运行测试文件 print("This is a pytest script. 
Run it using the 'pytest' command.") diff --git a/test/test_perf/test_metric.cpp b/test/test_perf/test_metric.cpp index 56bee10674152030a55ab56c04f59fd6c1ce8aa1..68710cb5a9815368bf0151ac0712fccc3f45b43e 100644 --- a/test/test_perf/test_metric.cpp +++ b/test/test_perf/test_metric.cpp @@ -104,9 +104,10 @@ TEST_F(TestMetric, GetNumaIdList) TEST_F(TestMetric, CollectDDRBandwidth) { - PmuDeviceAttr devAttr = {}; - devAttr.metric = PMU_DDR_READ_BW; - int pd = PmuDeviceOpen(&devAttr, 1); + PmuDeviceAttr devAttr[2] = {}; + devAttr[0].metric = PMU_DDR_READ_BW; + devAttr[1].metric = PMU_DDR_WRITE_BW; + int pd = PmuDeviceOpen(devAttr, 2); cout << Perror() << endl; ASSERT_NE(pd, -1); PmuEnable(pd); @@ -117,16 +118,11 @@ TEST_F(TestMetric, CollectDDRBandwidth) ASSERT_NE(oriLen, -1); PmuDeviceData *devData = nullptr; - auto len = PmuGetDevMetric(oriData, oriLen, &devAttr, 1, &devData); - ASSERT_EQ(len, 4); - ASSERT_EQ(devData[0].numaId, 0); - ASSERT_EQ(devData[0].mode, PMU_METRIC_NUMA); - ASSERT_EQ(devData[1].numaId, 1); - ASSERT_EQ(devData[1].mode, PMU_METRIC_NUMA); - ASSERT_EQ(devData[2].numaId, 2); - ASSERT_EQ(devData[2].mode, PMU_METRIC_NUMA); - ASSERT_EQ(devData[3].numaId, 3); - ASSERT_EQ(devData[3].mode, PMU_METRIC_NUMA); + auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 2, &devData); + ASSERT_NE(devData[0].count, 0); + ASSERT_EQ(devData[0].mode, PMU_METRIC_CHANNEL); + ASSERT_EQ(devData[0].metric, PMU_DDR_READ_BW); + ASSERT_EQ(devData[len - 1].metric, PMU_DDR_WRITE_BW); DevDataFree(devData); PmuDataFree(oriData); PmuClose(pd); @@ -160,37 +156,6 @@ TEST_F(TestMetric, CollectL3Latency) PmuClose(pd); } -TEST_F(TestMetric, CollectL3LatencyAndDDR) -{ - PmuDeviceAttr devAttr[2] = {}; - devAttr[0].metric = PMU_L3_LAT; - devAttr[1].metric = PMU_DDR_WRITE_BW; - - int pd = PmuDeviceOpen(devAttr, 2); - cout << Perror() << endl; - ASSERT_NE(pd, -1); - PmuEnable(pd); - sleep(1); - PmuDisable(pd); - PmuData* oriData = nullptr; - int oriLen = PmuRead(pd, &oriData); - 
ASSERT_NE(oriLen, -1); - - PmuDeviceData *devData = nullptr; - auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 2, &devData); - unsigned clusterCount = GetClusterCount(); - unsigned numaCount = GetNumaNodeCount(); - ASSERT_EQ(len, clusterCount + numaCount); - ASSERT_NE(devData[0].count, 0); - ASSERT_EQ(devData[0].metric, PMU_L3_LAT); - ASSERT_EQ(devData[0].mode, PMU_METRIC_CLUSTER); - ASSERT_EQ(devData[clusterCount].metric, PMU_DDR_WRITE_BW); - ASSERT_EQ(devData[clusterCount].mode, PMU_METRIC_NUMA); - DevDataFree(devData); - PmuDataFree(oriData); - PmuClose(pd); -} - TEST_F(TestMetric, CollectL3Traffic) { PmuDeviceAttr devAttr = {}; @@ -344,4 +309,29 @@ TEST_F(TestMetric, GetMetricSmmuTransaction) DevDataFree(devData); PmuDataFree(oriData); PmuClose(pd); +} + +TEST_F(TestMetric, GetMetricHHACross) +{ + PmuDeviceAttr devAttr[2] = {}; + devAttr[0].metric = PMU_HHA_CROSS_NUMA; + devAttr[1].metric = PMU_HHA_CROSS_SOCKET; + int pd = PmuDeviceOpen(devAttr, 2); + ASSERT_NE(pd, -1); + PmuEnable(pd); + sleep(1); + PmuDisable(pd); + PmuData* oriData = nullptr; + int oriLen = PmuRead(pd, &oriData); + ASSERT_NE(oriLen, -1); + + PmuDeviceData *devData = nullptr; + auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 2, &devData); + ASSERT_EQ(devData[0].metric, PMU_HHA_CROSS_NUMA); + ASSERT_EQ(devData[0].mode, PMU_METRIC_NUMA); + ASSERT_EQ(devData[len - 1].metric, PMU_HHA_CROSS_SOCKET); + ASSERT_EQ(devData[len - 1].mode, PMU_METRIC_NUMA); + DevDataFree(devData); + PmuDataFree(oriData); + PmuClose(pd); } \ No newline at end of file diff --git a/util/common.h b/util/common.h index 4ffd0c8e4cef5f193a60733916afcae753a66175..8932295a0a38e412fc1b71d3c6039ec83ad4ba4b 100644 --- a/util/common.h +++ b/util/common.h @@ -18,6 +18,7 @@ #include #include #include +#include const std::string TRACE_EVENT_PATH = "/sys/kernel/tracing/events/"; const std::string TRACE_DEBUG_EVENT_PATH = "/sys/kernel/debug/tracing/events/";