From 93036c0ec010aebe47a4d25f50161bebcae42fbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Tue, 15 Apr 2025 18:37:17 +0800 Subject: [PATCH 01/14] =?UTF-8?q?=E4=BF=AE=E6=94=B9python=E6=9E=84?= =?UTF-8?q?=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/modules/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/modules/CMakeLists.txt b/python/modules/CMakeLists.txt index 8f898fc..40b53a1 100644 --- a/python/modules/CMakeLists.txt +++ b/python/modules/CMakeLists.txt @@ -22,7 +22,7 @@ configure_file( ) add_custom_target(${PROJECT_NAME} ALL - COMMAND ${PYTHON_EXECUTABLE} setup.py install --user + COMMAND ${PYTHON_EXECUTABLE} setup.py install WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) -- Gitee From 5c9dc0cf932848b65b75692c909f1c3dbc8cd9d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Wed, 16 Apr 2025 17:57:51 +0800 Subject: [PATCH 02/14] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=B3=BB=E7=BB=9F?= =?UTF-8?q?=E8=B0=83=E7=94=A8=E5=87=BD=E6=95=B0=E8=80=97=E6=97=B6=E7=9A=84?= =?UTF-8?q?=E8=B5=B7=E5=A7=8B=E6=97=B6=E9=97=B4=E6=88=B3=E7=9A=84=E4=BF=A1?= =?UTF-8?q?=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/Python_API.md | 1 + include/pmu.h | 1 + pmu/pmu_analysis.cpp | 1 + python/modules/_libkperf/Pmu.py | 14 ++++++++++++++ python/tests/test_trace_analysis.py | 4 ++-- test/test_perf/test_trace_analysis.cpp | 5 +++++ 6 files changed, 24 insertions(+), 2 deletions(-) diff --git a/docs/Python_API.md b/docs/Python_API.md index 3363dfd..8772787 100644 --- a/docs/Python_API.md +++ b/docs/Python_API.md @@ -295,6 +295,7 @@ pd为kperf.trace_open返回值 * free: 释放当前PmuTraceData数据 * class lmplPmuTraceData: * funcs: 系统调用函数名 + * startTs: 耗时起始时刻 * elapsedTime: 耗时时间 * pid: 进程id * tid: 线程id diff --git a/include/pmu.h b/include/pmu.h index 4ea5f04..d8c43ad 100644 --- a/include/pmu.h +++ 
b/include/pmu.h @@ -257,6 +257,7 @@ struct PmuData { struct PmuTraceData { const char *funcs; // system call function + int64_t startTs; // start time stamp. unit: ns double elapsedTime; // elapsed time pid_t pid; // process id int tid; // thread id diff --git a/pmu/pmu_analysis.cpp b/pmu/pmu_analysis.cpp index b1446ec..37d8cf9 100644 --- a/pmu/pmu_analysis.cpp +++ b/pmu/pmu_analysis.cpp @@ -141,6 +141,7 @@ namespace KUNPENG_PMU { PmuTraceData traceDataItem = {0}; traceDataItem.funcs = funName; double nsToMsUnit = 1000000.0; + traceDataItem.startTs = enterPmuData.ts; traceDataItem.elapsedTime = (double)(exitPmuData.ts - enterPmuData.ts) / nsToMsUnit; // convert to ms traceDataItem.pid = enterPmuData.pid; traceDataItem.tid = enterPmuData.tid; diff --git a/python/modules/_libkperf/Pmu.py b/python/modules/_libkperf/Pmu.py index 48939fb..5c381e0 100644 --- a/python/modules/_libkperf/Pmu.py +++ b/python/modules/_libkperf/Pmu.py @@ -1307,6 +1307,7 @@ class CtypesPmuTraceData(ctypes.Structure): """ struct PmuTraceData { const char *funcs; // system call function + uint64_t startTs; // start time stamp. 
unit: ns double elapsedTime; // elapsed time pid_t pid; // process id int tid; // thread id @@ -1316,6 +1317,7 @@ class CtypesPmuTraceData(ctypes.Structure): """ _fields_ = [ ('funcs', ctypes.c_char_p), + ('startTs', ctypes.c_int64), ('elapsedTime', ctypes.c_double), ('pid', ctypes.c_int), ('tid', ctypes.c_int), @@ -1325,6 +1327,7 @@ class CtypesPmuTraceData(ctypes.Structure): def __init__(self, funcs: str = '', + startTs: int = 0, elapsedTime: float = 0.0, pid: int = 0, tid: int = 0, @@ -1334,6 +1337,7 @@ class CtypesPmuTraceData(ctypes.Structure): super().__init__(*args, **kw) self.funcs = ctypes.c_char_p(funcs.encode(UTF_8)) + self.startTs = ctypes.c_int64(startTs) self.elapsedTime = ctypes.c_double(elapsedTime) self.pid = ctypes.c_int(pid) self.tid = ctypes.c_int(tid) @@ -1344,6 +1348,7 @@ class ImplPmuTraceData: __slots__ = ['__c_pmu_trace_data'] def __init__(self, funcs: str = '', + startTs: int = 0, elapsedTime: float = 0.0, pid: int = 0, tid: int = 0, @@ -1352,6 +1357,7 @@ class ImplPmuTraceData: *args: Any, **kw: Any) -> None: self.__c_pmu_trace_data = CtypesPmuTraceData( funcs=funcs, + startTs=startTs, elapsedTime=elapsedTime, pid=pid, tid=tid, @@ -1370,6 +1376,14 @@ class ImplPmuTraceData: @funcs.setter def funcs(self, funcs: str) -> None: self.__c_pmu_trace_data.funcs = ctypes.c_char_p(funcs.encode(UTF_8)) + + @property + def startTs(self) -> int: + return self.__c_pmu_trace_data.startTs + + @startTs.setter + def startTs(self, startTs: int) -> None: + self.__c_pmu_trace_data.startTs = ctypes.c_int64(startTs) @property def elapsedTime(self) -> float: diff --git a/python/tests/test_trace_analysis.py b/python/tests/test_trace_analysis.py index 6243993..7e6e155 100644 --- a/python/tests/test_trace_analysis.py +++ b/python/tests/test_trace_analysis.py @@ -74,7 +74,7 @@ def test_collect_single_trace_data(run_test_exe, setup_trace): pmu_trace_data = kperf.trace_read(pd) for data in pmu_trace_data.iter: - print(f"funcName: {data.funcs} elapsedTime: 
{data.elapsedTime} pid: {data.pid} tid: {data.tid} cpu: {data.cpu} comm: {data.comm}") + print(f"funcName: {data.funcs} startTs: {data.startTs} elapsedTime: {data.elapsedTime} pid: {data.pid} tid: {data.tid} cpu: {data.cpu} comm: {data.comm}") # Assert that at least one trace record is captured assert pmu_trace_data.iter, "No trace data was captured" @@ -93,7 +93,7 @@ def test_collect_all_syscall_trace_data(setup_trace): pmu_trace_data = kperf.trace_read(pd) for data in pmu_trace_data.iter: - print(f"funcName: {data.funcs} elapsedTime: {data.elapsedTime} pid: {data.pid} tid: {data.tid} cpu: {data.cpu} comm: {data.comm}") + print(f"funcName: {data.funcs} startTs: {data.startTs} elapsedTime: {data.elapsedTime} pid: {data.pid} tid: {data.tid} cpu: {data.cpu} comm: {data.comm}") # Assert that at least one trace record is captured assert pmu_trace_data.iter, "No trace data was captured" diff --git a/test/test_perf/test_trace_analysis.cpp b/test/test_perf/test_trace_analysis.cpp index 2182b77..7062d69 100644 --- a/test/test_perf/test_trace_analysis.cpp +++ b/test/test_perf/test_trace_analysis.cpp @@ -93,6 +93,11 @@ TEST_F(TestAnaylzeData, collect_single_trace_data_success) { EnableTracePointer(pd, 1); int len = PmuTraceRead(pd, &data); EXPECT_TRUE(data != nullptr); + for (int i = 0; i < len; i++) { + cout << "funcName: " << data[i].funcs << " startTs: " << data[i].startTs << " elapsedTime: " << data[i].elapsedTime + << " pid: " << data[i].pid << " tid: " << data[i].tid << " cpu: " << data[i].cpu + << " comm: " << data[i].comm << endl; + } } /** -- Gitee From 9018ab1e10b2a8e38b5114b6e0d189a1eaaaa231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Thu, 17 Apr 2025 14:45:27 +0800 Subject: [PATCH 03/14] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E9=94=81=E4=BF=9D=E6=8A=A4=EF=BC=8C=E9=98=B2=E6=AD=A2=E5=A4=9A?= =?UTF-8?q?=E7=BA=BF=E7=A8=8B=E5=9C=BA=E6=99=AF=E4=B8=8B=E5=87=BA=E7=8E=B0?= 
=?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=B7=B7=E4=B9=B1,=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E4=B8=80=E4=BA=9B=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E7=9A=84?= =?UTF-8?q?=E5=86=99=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Contributing.md | 2 +- docs/Python_API.md | 4 +-- include/pmu.h | 4 +-- pmu/pmu.cpp | 2 ++ pmu/pmu_metric.cpp | 61 +++++++++++++++++++++++++-------- pmu/pmu_metric.h | 23 +++++++++++++ python/modules/_libkperf/Pmu.py | 26 +++++++++----- python/modules/kperf/pmu.py | 2 +- python/tests/test_metric.py | 34 +++++++++++------- test/test_perf/test_metric.cpp | 48 ++++++++++++++++++++------ 10 files changed, 155 insertions(+), 51 deletions(-) create mode 100644 pmu/pmu_metric.h diff --git a/Contributing.md b/Contributing.md index daaeca1..9551a89 100644 --- a/Contributing.md +++ b/Contributing.md @@ -7,7 +7,7 @@ ### 开发者相关 - libkperf需要保持较高的兼容性,建议使用gcc 4来编译工程,请基于C++11来开发。 -- 如果要编译调试版,可以用编译命令```bash build.sh buildType=debug```. 在调试时,可以设置环境变量PERF_DEBUG=1,用于打印调试信息。 +- 如果要编译调试版,可以用编译命令```bash build.sh build_type=debug```. 在调试时,可以设置环境变量PERF_DEBUG=1,用于打印调试信息。 - 如果要编译并运行UT用例,可以用编译命令```bash build.sh test=True```. 
UT用例中需要执行SPE采样,如果环境不支持SPE,那么这些用例会失败。鲲鹏上配置SPE的方法参考:https://www.hikunpeng.com/document/detail/zh/kunpengdevps/userguide/usermanual/kunpengoper_06_0010.html - 提交PR时,请描述问题、原因、方法,以便后续跟踪问题和特性。请在提交前通过UT用例测试。 diff --git a/docs/Python_API.md b/docs/Python_API.md index 8772787..1ed8766 100644 --- a/docs/Python_API.md +++ b/docs/Python_API.md @@ -330,8 +330,8 @@ for func_name in kperf.sys_call_func_list(): kperf.device_open(dev_attr: List[PmuDeviceAttr]) 初始化采集uncore事件指标的能力 * class PmuDeviceAttr: * metic: 指定需要采集的指标 - * PMU_DDR_READ_BW 采集每个numa的ddrc的读带宽,单位:Bytes/s - * PMU_DDR_WRITE_BW 采集每个numa的ddrc的写带宽,单位:Bytes/s + * PMU_DDR_READ_BW 采集每个numa的ddrc的读带宽,单位:Bytes + * PMU_DDR_WRITE_BW 采集每个numa的ddrc的写带宽,单位:Bytes * PMU_L3_TRAFFIC 采集每个core的L3的访问字节数,单位:Bytes * PMU_L3_MISS 采集每个core的L3的miss数量,单位:count * PMU_L3_REF 采集每个core的L3的总访问数量,单位:count diff --git a/include/pmu.h b/include/pmu.h index d8c43ad..af9bb2a 100644 --- a/include/pmu.h +++ b/include/pmu.h @@ -406,11 +406,11 @@ struct SampleRawField *PmuGetFieldExp(struct SampleRawData *rawData, const char enum PmuDeviceMetric { // Pernuma metric. // Collect ddr read bandwidth for each numa node. - // Unit: Bytes/s + // Unit: Bytes PMU_DDR_READ_BW, // Pernuma metric. // Collect ddr write bandwidth for each numa node. - // Unit: Bytes/s + // Unit: Bytes PMU_DDR_WRITE_BW, // Percore metric. // Collect L3 access bytes for each cpu core. 
diff --git a/pmu/pmu.cpp b/pmu/pmu.cpp index 64fb10a..44414ca 100644 --- a/pmu/pmu.cpp +++ b/pmu/pmu.cpp @@ -26,6 +26,7 @@ #include "linked_list.h" #include "pcerr.h" #include "safe_handler.h" +#include "pmu_metric.h" #include "trace_pointer_parser.h" #include "pmu.h" @@ -772,6 +773,7 @@ void PmuClose(int pd) } try { KUNPENG_PMU::PmuList::GetInstance()->Close(pd); + PmuDeviceBdfListFree(); New(SUCCESS); } catch (std::bad_alloc&) { New(COMMON_ERR_NOMEM); diff --git a/pmu/pmu_metric.cpp b/pmu/pmu_metric.cpp index 4537656..6dffb3b 100644 --- a/pmu/pmu_metric.cpp +++ b/pmu/pmu_metric.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include "common.h" #include "uncore.h" #include "cpu_map.h" @@ -40,6 +41,10 @@ using namespace pcerr; static unsigned maxCpuNum = 0; static vector coreArray; +static std::mutex pmuBdfListMtx; +static std::mutex pmuCoreListMtx; +static std::mutex pmuDeviceDataMtx; + static const string SYS_DEVICES = "/sys/devices/"; static const string SYS_BUS_PCI_DEVICES = "/sys/bus/pci/devices"; static const string SYS_IOMMU_DEVICES = "/sys/class/iommu"; @@ -89,7 +94,7 @@ namespace KUNPENG_PMU { PMU_PCIE_TX_MWR_BW, PMU_SMMU_TRAN}; - static bool IsValidBdf(PmuDeviceMetric metric) + static bool IsBdfMetric(PmuDeviceMetric metric) { return perpcieMetric.find(metric) != perpcieMetric.end(); } @@ -576,7 +581,9 @@ namespace KUNPENG_PMU { int bdfMax = std::get<2>(pciePmu.second); if (bdfmin == -1) { if (bus == pcieBus) { - pcieBdfList.emplace_back(strdup(bdfList[i].c_str())); + char* bdfCopy = new char[bdfList[i].size() + 1]; + strcpy(bdfCopy, bdfList[i].c_str()); + pcieBdfList.emplace_back(bdfCopy); bdfToPcieMap[bdfList[i]] = pciePmu.first; } } else { @@ -585,7 +592,9 @@ namespace KUNPENG_PMU { continue; } if (bus == pcieBus && bdfValue >= bdfmin && bdfValue <= bdfMax) { - pcieBdfList.emplace_back(strdup(bdfList[i].c_str())); + char* bdfCopy = new char[bdfList[i].size() + 1]; + strcpy(bdfCopy, bdfList[i].c_str()); + pcieBdfList.emplace_back(bdfCopy); 
bdfToPcieMap[bdfList[i]] = pciePmu.first; } } @@ -597,6 +606,8 @@ namespace KUNPENG_PMU { return pcieBdfList.data(); } + // convert smmu name to smmu pmu device name: smmu3.0x -> smmuv3_pmcg_ + // eg: smmu3.0x0000000148000000 <-> smmuv3_pmcg_148020 static int FindSmmuToSmmuPmu(std::string& smmuName, std::string& smmuPmuName) { string smmuPmuKey = ""; @@ -682,7 +693,7 @@ namespace KUNPENG_PMU { PmuDeviceAttr& deviceAttr, const UncoreDeviceConfig& metricConfig) { vector eventList; - if (IsValidBdf(deviceAttr.metric)) { + if (IsBdfMetric(deviceAttr.metric)) { string bdf = deviceAttr.bdf; for (const auto& evt : metricConfig.events) { string device = ""; @@ -786,19 +797,19 @@ namespace KUNPENG_PMU { static int CheckBdf(struct PmuDeviceAttr& deviceAttr) { - if (IsValidBdf(deviceAttr.metric) && deviceAttr.bdf == nullptr) { + if (IsBdfMetric(deviceAttr.metric) && deviceAttr.bdf == nullptr) { New(LIBPERF_ERR_INVALID_PMU_DEVICES_BDF, "When collecting pcie or smmu metric, bdf value can not is nullptr!"); return LIBPERF_ERR_INVALID_PMU_DEVICES_BDF; } if (deviceAttr.metric >= PmuDeviceMetric::PMU_PCIE_RX_MRD_BW && deviceAttr.metric <= PmuDeviceMetric::PMU_PCIE_TX_MWR_BW && !CheckPcieBdf(deviceAttr.bdf)) { New(LIBPERF_ERR_NOT_SOUUPUT_PCIE_BDF, "this bdf not support pcie metric counting." - " Plese use PmuDeviceBdfList to query."); + " Please use PmuDeviceBdfList to query."); return LIBPERF_ERR_NOT_SOUUPUT_PCIE_BDF; } if (deviceAttr.metric == PmuDeviceMetric::PMU_SMMU_TRAN && !CheckSmmuBdf(deviceAttr.bdf)) { New(LIBPERF_ERR_NOT_SOUUPUT_SMMU_BDF, "this bdf not support smmu metric counting." 
- " Plese use PmuDeviceBdfList to query."); + " Please use PmuDeviceBdfList to query."); return LIBPERF_ERR_NOT_SOUUPUT_SMMU_BDF; } New(SUCCESS); @@ -835,7 +846,7 @@ namespace KUNPENG_PMU { std::unordered_set uniqueSet; for (int i = 0; i < len; ++i) { std::string key = ""; - if (IsValidBdf(attr[i].metric)) { + if (IsBdfMetric(attr[i].metric)) { key = std::to_string(attr[i].metric) + "_" + attr[i].bdf; } else { key = std::to_string(attr[i].metric); @@ -1124,8 +1135,8 @@ namespace KUNPENG_PMU { typedef int (*AggregateMetricCb)(const PmuDeviceMetric metric, const vector &rawData, vector &devData); map computeMetricMap = {{PMU_DDR_READ_BW, DDRBw}, - {PMU_DDR_WRITE_BW, DDRBw}, - {PMU_L3_TRAFFIC, L3Bw}}; + {PMU_DDR_WRITE_BW, DDRBw}, + {PMU_L3_TRAFFIC, L3Bw}}; map aggregateMap = { {PMU_DDR_READ_BW, AggregateByNuma}, {PMU_DDR_WRITE_BW, AggregateByNuma}, @@ -1160,7 +1171,7 @@ namespace KUNPENG_PMU { } // For pcie events, check if event is related with specifi bdf. - if (IsValidBdf(devAttr.metric)) { + if (IsBdfMetric(devAttr.metric)) { auto bdfStr = ExtractEvtStr("bdf", evtName); if (bdfStr.empty()) { bdfStr = ExtractEvtStr("filter_stream_id", evtName); @@ -1237,7 +1248,7 @@ namespace KUNPENG_PMU { if (perClusterMetric.find(devAttr.metric) != perClusterMetric.end()) { devData.clusterId = pmuData[i].cpuTopo->coreId / clusterWidth; } - if (IsValidBdf(devAttr.metric)) { + if (IsBdfMetric(devAttr.metric)) { devData.bdf = devAttr.bdf; } devDataList.emplace_back(devData); @@ -1253,6 +1264,7 @@ using namespace KUNPENG_PMU; const char** PmuDeviceBdfList(enum PmuBdfType bdfType, unsigned *numBdf) { try { + lock_guard lg(pmuBdfListMtx); SetWarn(SUCCESS); int err = 0; if (bdfType == PmuBdfType::PMU_BDF_TYPE_PCIE) { @@ -1284,6 +1296,24 @@ const char** PmuDeviceBdfList(enum PmuBdfType bdfType, unsigned *numBdf) } } +static void PmuBdfListFreeSingle(vector &bdfList) +{ + for (auto& bdf : bdfList) { + if (bdf != NULL && bdf[0] != '\0') { + delete[] bdf; + } + } + bdfList.clear(); +} + 
+void PmuDeviceBdfListFree() +{ + lock_guard lg(pmuBdfListMtx); + PmuBdfListFreeSingle(pcieBdfList); + PmuBdfListFreeSingle(smmuBdfList); + New(SUCCESS); +} + int PmuDeviceOpen(struct PmuDeviceAttr *attr, unsigned len) { SetWarn(SUCCESS); @@ -1389,6 +1419,7 @@ int PmuGetDevMetric(struct PmuData *pmuData, unsigned len, auto dataPtr = devData.data(); int retLen = devData.size(); // Make relationship between raw pointer and vector, for DevDataFree. + lock_guard lg(pmuDeviceDataMtx); deviceDataMap[dataPtr] = move(devData); *data = dataPtr; New(SUCCESS); @@ -1402,6 +1433,7 @@ int PmuGetDevMetric(struct PmuData *pmuData, unsigned len, void DevDataFree(struct PmuDeviceData *data) { SetWarn(SUCCESS); + lock_guard lg(pmuDeviceDataMtx); if (deviceDataMap.find(data) != deviceDataMap.end()) { deviceDataMap.erase(data); } @@ -1442,8 +1474,8 @@ static void InitializeCoreArray() int PmuGetClusterCore(unsigned clusterId, unsigned **coreList) { - try - { + try { + lock_guard lg(pmuCoreListMtx); InitializeCoreArray(); bool hyperThread = false; int err = HyperThreadEnabled(hyperThread); @@ -1478,6 +1510,7 @@ int PmuGetClusterCore(unsigned clusterId, unsigned **coreList) int PmuGetNumaCore(unsigned nodeId, unsigned **coreList) { try { + lock_guard lg(pmuCoreListMtx); string nodeListFile = "/sys/devices/system/node/node" + to_string(nodeId) + "/cpulist"; ifstream in(nodeListFile); if (!in.is_open()) { diff --git a/pmu/pmu_metric.h b/pmu/pmu_metric.h new file mode 100644 index 0000000..593f97d --- /dev/null +++ b/pmu/pmu_metric.h @@ -0,0 +1,23 @@ +/****************************************************************************** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * libkperf licensed under the Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR + * PURPOSE. + * See the Mulan PSL v2 for more details. + * Author: Mr.Lei + * Create: 2024-04-17 + * Description: definition of pmu metric some interface for free + ******************************************************************************/ + + #ifndef PMU_METRIC_H + #define PMU_METRIC_H + + // free Bdf List for PmuClose interface + void PmuDeviceBdfListFree(); + + #endif // PMU_METRIC_H + \ No newline at end of file diff --git a/python/modules/_libkperf/Pmu.py b/python/modules/_libkperf/Pmu.py index 5c381e0..b2236d6 100644 --- a/python/modules/_libkperf/Pmu.py +++ b/python/modules/_libkperf/Pmu.py @@ -1638,22 +1638,29 @@ def PmuGetFieldExp(rawData: ctypes.POINTER(CtypesSampleRawData), field_name: str return SampleRawField.from_sample_raw_field(pointer_field.contents) -def PmuDeviceBdfList(bdf_type: int) -> List[str]: +def PmuDeviceBdfListFree() -> None: """ - Query all available BDF (Bus:Device.Function) list from system. 
- - Args: - bdf_type: Type of BDF chosen by user (PMU_BDF_TYPE_PCIE or PMU_BDF_TYPE_SMMU) - - Returns: - List of BDF strings + void PmuDeviceBdfListFree() + """ + c_PmuDeviceBdfListFree = kperf_so.PmuDeviceBdfListFree + c_PmuDeviceBdfListFree.argtypes = [] + c_PmuDeviceBdfListFree.restype = None + + c_PmuDeviceBdfListFree() + +def PmuDeviceBdfList(bdf_type: int) -> Iterator[str]: + """ + const char** PmuDeviceBdfList(enum PmuBdfType bdfType, unsigned *numBdf); """ c_PmuDeviceBdfList = kperf_so.PmuDeviceBdfList - c_PmuDeviceBdfList.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_uint)] + c_PmuDeviceBdfList.argtypes = [ctypes.c_int] c_PmuDeviceBdfList.restype = ctypes.POINTER(ctypes.c_char_p) + c_bdf_type = ctypes.c_int(bdf_type) c_num_bdf = ctypes.c_uint() + c_bdf_list = c_PmuDeviceBdfList(c_bdf_type, ctypes.byref(c_num_bdf)) + return [c_bdf_list[i].decode(UTF_8) for i in range(c_num_bdf.value)] @@ -1877,6 +1884,7 @@ __all__ = [ 'ImplPmuDeviceData', 'PmuDeviceData', 'PmuDeviceBdfList', + 'PmuDeviceBdfListFree', 'PmuDeviceOpen', 'PmuGetDevMetric', 'PmuGetCpuFreq', diff --git a/python/modules/kperf/pmu.py b/python/modules/kperf/pmu.py index f27a8b1..975b122 100644 --- a/python/modules/kperf/pmu.py +++ b/python/modules/kperf/pmu.py @@ -436,7 +436,7 @@ def get_field_exp(pmu_data: _libkperf.ImplPmuData, field_name: str) -> SampleRaw """ return _libkperf.PmuGetFieldExp(pmu_data.rawData.c_pmu_data_rawData, field_name) -def device_bdf_list(bdf_type: PmuBdfType) -> List[str]: +def device_bdf_list(bdf_type: PmuBdfType) -> Iterator[str]: """ Query all available BDF (Bus:Device.Function) list from system. 
:param bdf_type: type of bdf chosen by user diff --git a/python/tests/test_metric.py b/python/tests/test_metric.py index 8b45cc1..4a878ae 100644 --- a/python/tests/test_metric.py +++ b/python/tests/test_metric.py @@ -76,17 +76,19 @@ def print_dev_data_details(dev_data): def test_get_pcie_bdf_list(): bdf_type = kperf.PmuBdfType.PMU_BDF_TYPE_PCIE - bdf_list = kperf.device_bdf_list(bdf_type) + bdf_list_iter = kperf.device_bdf_list(bdf_type) print(kperf.error()) - print(len(bdf_list)) - assert bdf_list is not None, f"Expected non-null bdf_list, but got {bdf_list}" + for bdf in bdf_list_iter: + print(f"bdf: {bdf}") + assert bdf_list_iter is not None, f"Expected non-null bdf_list_iter, but got {bdf_list_iter}" def test_get_smmu_bdf_list(): bdf_type = kperf.PmuBdfType.PMU_BDF_TYPE_SMMU - bdf_list = kperf.device_bdf_list(bdf_type) + bdf_list_iter = kperf.device_bdf_list(bdf_type) print(kperf.error()) - print(len(bdf_list)) - assert bdf_list is not None, f"Expected non-null bdf_list, but got {bdf_list}" + for bdf in bdf_list_iter: + print(f"bdf: {bdf}") + assert bdf_list_iter is not None, f"Expected non-null bdf_list_iter, but got {bdf_list_iter}" def test_get_cpu_freq(): core = 6 @@ -127,6 +129,7 @@ def test_collect_ddr_bandwidth(): assert dev_data[0].numaId == 0 assert dev_data[0].mode == kperf.PmuMetricMode.PMU_METRIC_NUMA print_dev_data_details(dev_data) + kperf.close(pd) def test_collect_l3_latency(): dev_attr = [ @@ -145,6 +148,7 @@ def test_collect_l3_latency(): assert len(dev_data) == get_cluster_nums() assert dev_data[0].clusterId == 0 print_dev_data_details(dev_data) + kperf.close(pd) def test_collect_l3_latency_and_ddr(): dev_attr = [ @@ -163,6 +167,7 @@ def test_collect_l3_latency_and_ddr(): dev_data = kperf.get_device_metric(ori_data, dev_attr) assert len(dev_data) == get_cluster_nums() + 4 print_dev_data_details(dev_data) + kperf.close(pd) def test_collect_l3_traffic(): @@ -182,6 +187,7 @@ def test_collect_l3_traffic(): assert len(dev_data) == get_cpu_nums() 
assert dev_data[0].mode == kperf.PmuMetricMode.PMU_METRIC_CORE print_dev_data_details(dev_data) + kperf.close(pd) def test_collect_l3_traffic_and_l3_ref(): @@ -205,6 +211,7 @@ def test_collect_l3_traffic_and_l3_ref(): assert dev_data[get_cpu_nums()].metric == kperf.PmuDeviceMetric.PMU_L3_REF assert dev_data[get_cpu_nums()].mode == kperf.PmuMetricMode.PMU_METRIC_CORE print_dev_data_details(dev_data) + kperf.close(pd) def test_collect_l3_latency_and_l3_miss(): @@ -225,12 +232,13 @@ def test_collect_l3_latency_and_l3_miss(): data_len = get_cpu_nums() + get_cluster_nums() assert len(dev_data) == data_len print_dev_data_details(dev_data) + kperf.close(pd) def test_get_metric_pcie_bandwidth(): - bdf_list = kperf.device_bdf_list(kperf.PmuBdfType.PMU_BDF_TYPE_PCIE) + bdf_list_iter = kperf.device_bdf_list(kperf.PmuBdfType.PMU_BDF_TYPE_PCIE) dev_attr = [ kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_PCIE_RX_MRD_BW, bdf=bdf) - for bdf in bdf_list + for bdf in bdf_list_iter ] pd = kperf.device_open(dev_attr) print(kperf.error()) @@ -242,14 +250,15 @@ def test_get_metric_pcie_bandwidth(): assert len(ori_data) != -1, f"Expected non-negative ori_len, but got {len(ori_data)}" dev_data = kperf.get_device_metric(ori_data, dev_attr) - assert len(dev_data) == len(bdf_list) + assert len(dev_data) == len(dev_attr) print_dev_data_details(dev_data) + kperf.close(pd) def test_get_metric_smmu_transaction(): - bdf_list = kperf.device_bdf_list(kperf.PmuBdfType.PMU_BDF_TYPE_SMMU) + bdf_list_iter = kperf.device_bdf_list(kperf.PmuBdfType.PMU_BDF_TYPE_SMMU) dev_attr = [ kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_SMMU_TRAN, bdf=bdf) - for bdf in bdf_list + for bdf in bdf_list_iter ] pd = kperf.device_open(dev_attr) print(kperf.error()) @@ -261,8 +270,9 @@ def test_get_metric_smmu_transaction(): assert len(ori_data) != -1, f"Expected non-negative ori_len, but got {len(ori_data)}" dev_data = kperf.get_device_metric(ori_data, dev_attr) - assert len(dev_data) == len(bdf_list) + assert 
len(dev_data) == len(dev_attr) print_dev_data_details(dev_data) + kperf.close(pd) if __name__ == '__main__': # 提示用户使用pytest 运行测试文件 diff --git a/test/test_perf/test_metric.cpp b/test/test_perf/test_metric.cpp index 3f4efeb..85b8653 100644 --- a/test/test_perf/test_metric.cpp +++ b/test/test_perf/test_metric.cpp @@ -126,6 +126,9 @@ TEST_F(TestMetric, CollectDDRBandwidth) ASSERT_EQ(devData[2].mode, PMU_METRIC_NUMA); ASSERT_EQ(devData[3].numaId, 3); ASSERT_EQ(devData[3].mode, PMU_METRIC_NUMA); + DevDataFree(devData); + PmuDataFree(oriData); + PmuClose(pd); } TEST_F(TestMetric, CollectL3Latency) @@ -151,6 +154,9 @@ TEST_F(TestMetric, CollectL3Latency) ASSERT_EQ(devData[0].mode, PMU_METRIC_CLUSTER); ASSERT_EQ(devData[clusterCount - 1].clusterId, clusterCount - 1); ASSERT_EQ(devData[clusterCount - 1].mode, PMU_METRIC_CLUSTER); + DevDataFree(devData); + PmuDataFree(oriData); + PmuClose(pd); } TEST_F(TestMetric, CollectL3LatencyAndDDR) @@ -179,6 +185,9 @@ TEST_F(TestMetric, CollectL3LatencyAndDDR) ASSERT_EQ(devData[0].mode, PMU_METRIC_CLUSTER); ASSERT_EQ(devData[clusterCount].metric, PMU_DDR_WRITE_BW); ASSERT_EQ(devData[clusterCount].mode, PMU_METRIC_NUMA); + DevDataFree(devData); + PmuDataFree(oriData); + PmuClose(pd); } TEST_F(TestMetric, CollectL3Traffic) @@ -198,6 +207,9 @@ TEST_F(TestMetric, CollectL3Traffic) auto len = PmuGetDevMetric(oriData, oriLen, &devAttr, 1, &devData); ASSERT_EQ(len, GetCpuNums()); ASSERT_EQ(devData[0].mode, PMU_METRIC_CORE); + DevDataFree(devData); + PmuDataFree(oriData); + PmuClose(pd); } TEST_F(TestMetric, CollectL3TrafficAndL3REF) @@ -222,6 +234,9 @@ TEST_F(TestMetric, CollectL3TrafficAndL3REF) ASSERT_EQ(devData[0].mode, PMU_METRIC_CORE); ASSERT_EQ(devData[cpuNum].metric, PMU_L3_REF); ASSERT_EQ(devData[cpuNum].mode, PMU_METRIC_CORE); + DevDataFree(devData); + PmuDataFree(oriData); + PmuClose(pd); } TEST_F(TestMetric, CollectL3LatencyAndL3Miss) @@ -250,6 +265,9 @@ TEST_F(TestMetric, CollectL3LatencyAndL3Miss) ASSERT_EQ(devData[0].mode, 
PMU_METRIC_CLUSTER); ASSERT_EQ(devData[clusterCount].metric, PMU_L3_MISS); ASSERT_EQ(devData[clusterCount].mode, PMU_METRIC_CORE); + DevDataFree(devData); + PmuDataFree(oriData); + PmuClose(pd); } TEST_F(TestMetric, GetMetricPcieBandwidth) @@ -279,6 +297,12 @@ TEST_F(TestMetric, GetMetricPcieBandwidth) ASSERT_EQ(devData[0].metric, PMU_PCIE_RX_MRD_BW); ASSERT_EQ(devData[0].mode, PMU_METRIC_BDF); ASSERT_TRUE(strcmp(devData[0].bdf, bdfList[0]) == 0); + for (int i = 0; i < bdfLen; ++i) { + free(devAttr[i].bdf); + } + DevDataFree(devData); + PmuDataFree(oriData); + PmuClose(pd); } TEST_F(TestMetric, GetMetricSmmuTransaction) @@ -288,13 +312,13 @@ TEST_F(TestMetric, GetMetricSmmuTransaction) bdfList = PmuDeviceBdfList(PMU_BDF_TYPE_SMMU, &bdfLen); cout << Perror() << endl; ASSERT_NE(bdfList, nullptr); - PmuDeviceAttr devAttr[2] = {}; - devAttr[0].metric = PMU_SMMU_TRAN; - devAttr[0].bdf = strdup(bdfList[0]); - devAttr[1].metric = PMU_SMMU_TRAN; - devAttr[1].bdf = strdup(bdfList[1]); + PmuDeviceAttr devAttr[bdfLen] = {}; + for (int i = 0; i < bdfLen; ++i) { + devAttr[i].metric = PMU_SMMU_TRAN; + devAttr[i].bdf = strdup(bdfList[i]); + } - int pd = PmuDeviceOpen(devAttr, 2); + int pd = PmuDeviceOpen(devAttr, bdfLen); cout << Perror() << endl; ASSERT_NE(pd, -1); PmuEnable(pd); @@ -305,14 +329,18 @@ TEST_F(TestMetric, GetMetricSmmuTransaction) ASSERT_NE(oriLen, -1); PmuDeviceData *devData = nullptr; - auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 2, &devData); - ASSERT_EQ(len, 2); + auto len = PmuGetDevMetric(oriData, oriLen, devAttr, bdfLen, &devData); + ASSERT_EQ(len, bdfLen); ASSERT_EQ(devData[0].metric, PMU_SMMU_TRAN); ASSERT_EQ(devData[0].mode, PMU_METRIC_BDF); ASSERT_TRUE(strcmp(devData[0].bdf, devAttr[0].bdf) == 0); ASSERT_EQ(devData[1].metric, PMU_SMMU_TRAN); ASSERT_EQ(devData[1].mode, PMU_METRIC_BDF); ASSERT_TRUE(strcmp(devData[1].bdf, devAttr[1].bdf) == 0); - delete[] devAttr[0].bdf; - delete[] devAttr[1].bdf; + for (int i = 0; i < bdfLen; ++i) { + 
free(devAttr[i].bdf); + } + DevDataFree(devData); + PmuDataFree(oriData); + PmuClose(pd); } \ No newline at end of file -- Gitee From 51fa2c9742febe8eb5bd086204234b081c796e68 Mon Sep 17 00:00:00 2001 From: ganlixiong Date: Thu, 17 Apr 2025 19:32:34 +0800 Subject: [PATCH 04/14] =?UTF-8?q?=E8=A1=A5=E5=85=85L3=E3=80=81DDR=E3=80=81?= =?UTF-8?q?PCIE=E7=9A=84=E4=BD=BF=E7=94=A8=E8=AF=B4=E6=98=8E=E5=88=B0Detai?= =?UTF-8?q?ls=5FUsage.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.en.md | 2 +- README.md | 2 +- docs/Details_Usage.md | 430 +++++++++++++++++++++++------------------- pmu/pmu_list.cpp | 16 ++ 4 files changed, 258 insertions(+), 192 deletions(-) diff --git a/README.en.md b/README.en.md index 4bd9555..f6a888e 100644 --- a/README.en.md +++ b/README.en.md @@ -261,7 +261,7 @@ func main() { } for _, o := range dataVo.GoData { - fmt.Printf("event: %v count: %v", o.Evt, o.Count) + fmt.Printf("event: %v count: %v\n", o.Evt, o.Count) } kperf.PmuDataFree(dataVo) kperf.PmuClose(fd) diff --git a/README.md b/README.md index d93a64e..0d5c764 100644 --- a/README.md +++ b/README.md @@ -237,7 +237,7 @@ func main() { } for _, o := range dataVo.GoData { - fmt.Printf("event: %v count: %v", o.Evt, o.Count) + fmt.Printf("event: %v count: %v\n", o.Evt, o.Count) } kperf.PmuDataFree(dataVo) kperf.PmuClose(fd) diff --git a/docs/Details_Usage.md b/docs/Details_Usage.md index 37713c9..289c03c 100644 --- a/docs/Details_Usage.md +++ b/docs/Details_Usage.md @@ -600,162 +600,230 @@ pmu_attr = kperf.PmuAttr(evtList=evtList, includeNewFork=True) 注意,该功能是针对Counting模式,因为Sampling和SPE Sampling本身就会采集子线程的数据。 ### 采集DDRC带宽 -基于uncore事件可以计算DDRC的访存带宽,不同硬件平台有不同的计算方式。 -鲲鹏芯片上的访存带宽公式可以参考kernel的tools/perf/pmu-events/arch/arm64/hisilicon/hip09/sys/uncore-ddrc.json: -```json - { - "MetricExpr": "flux_wr * 32 / duration_time", - "BriefDescription": "Average bandwidth of DDRC memory write(Byte/s)", - "Compat": "0x00000030", - "MetricGroup": "DDRC", - 
"MetricName": "ddrc_bw_write", - "Unit": "hisi_sccl,ddrc" - }, - { - "MetricExpr": "flux_rd * 32 / duration_time", - "BriefDescription": "Average bandwidth of DDRC memory read(Byte/s)", - "Compat": "0x00000030", - "MetricGroup": "DDRC", - "MetricName": "ddrc_bw_read", - "Unit": "hisi_sccl,ddrc" - }, -``` - -根据公式,采集flux_wr和flux_rd事件,用于计算带宽: +鲲鹏上提供了DDRC的pmu设备,用于采集DDR的性能数据,比如带宽等。libkperf提供了API,用于获取每个numa的DDR带宽数据。 + +参考代码: ```c++ // c++代码示例 - - vector evts = { - "hisi_sccl1_ddrc/flux_rd/", - "hisi_sccl3_ddrc/flux_rd/", - "hisi_sccl5_ddrc/flux_rd/", - "hisi_sccl7_ddrc/flux_rd/", - "hisi_sccl1_ddrc/flux_wr/", - "hisi_sccl3_ddrc/flux_wr/", - "hisi_sccl5_ddrc/flux_wr/", - "hisi_sccl7_ddrc/flux_wr/" - }; // 采集hisi_scclX_ddrc设备下的flux_rd和flux_wr,具体设备名称因硬件而异,可以在/sys/devices/下查询。 - - PmuAttr attr = {0}; - attr.evtList = evts.data(); - attr.numEvt = evts.size(); - - int pd = PmuOpen(COUNTING, &attr); - if (pd == -1) { - cout << Perror() << "\n"; - return; - } +PmuDeviceAttr devAttr[2]; +// DDR读带宽 +devAttr[0].metric = PMU_DDR_READ_BW; +// DDR写带宽 +devAttr[1].metric = PMU_DDR_WRITE_BW; +// 初始化采集任务 +int pd = PmuDeviceOpen(devAttr, 2); +// 开始采集 +PmuEnable(pd); +sleep(1); +// 读取原始信息 +PmuData *oriData = nullptr; +int oriLen = PmuRead(pd, &oriData); +PmuDeviceData *devData = nullptr; +auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 2, &devData); +// 对于4个numa的服务器,devData的长度为8.前4个是读带宽,后4个是写带宽。 +for (int i=0;i<4;++i) { + // numaId表示数据对应的numa节点。 + // count是距离上次采集的DDR总读/写包长,单位是Byte, + // 需要除以时间间隔得到带宽(这里的时间间隔是1秒)。 + cout << "read bandwidth(" << devData[i].numaId << "): " << devData[i].count/1024/1024 << "M/s\n"; +} +for (int i=4;i<8;++i) { + cout << "write bandwidth(" << devData[i].numaId << "): " << devData[i].count/1024/1024 << "M/s\n"; +} +DevDataFree(devData); +PmuDataFree(oriData); +PmuDisable(pd); +``` - PmuEnable(pd); - for (int i=0;i<60;++i) { - sleep(1); - PmuData *data = nullptr; - int len = PmuRead(pd, &data); - // 有8个uncore事件,所以data的长度等于8. 
- // 前4个是4个numa的read带宽,后4个是4个numa的write带宽。 - for (int j=0;j<4;++j) { - printf("read bandwidth: %f M/s\n", (float)data[j].count*32/1024/1024); - } - for (int j=4;j<8;++j) { - printf("write bandwidth: %f M/s\n", (float)data[j].count*32/1024/1024); - } - PmuDataFree(data); +```python +# python代码示例 +dev_attr = [ + kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_READ_BW), + kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW) +] +pd = kperf.device_open(dev_attr) +kperf.enable(pd) +time.sleep(1) +kperf.disable(pd) +ori_data = kperf.read(pd) +dev_data = kperf.get_device_metric(ori_data, dev_attr) +for data in dev_data.iter: + if data.metric == kperf.PmuDeviceMetric.PMU_DDR_READ_BW: + print(f"read bandwidth({data.numaId}): {data.count/1024/1024} M/s") + if data.metric == kperf.PmuDeviceMetric.PMU_DDR_WRITE_BW: + print(f"write bandwidth({data.numaId}): {data.count/1024/1024} M/s") +``` + +```go +// go代码用例 +deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_DDR_READ_BW}, kperf.PmuDeviceAttr{Metric: kperf.PMU_DDR_WRITE_BW}} +fd, _ := kperf.PmuDeviceOpen(deviceAttrs) +kperf.PmuEnable(fd) +time.Sleep(1 * time.Second) +kperf.PmuDisable(fd) +dataVo, _ := kperf.PmuRead(fd) +deivceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs) +for _, v := range deivceDataVo.GoDeviceData { + if v.Metric == kperf.PMU_DDR_READ_BW { + fmt.Printf("read bandwidth(%v): %v M/s\n", v.NumaId, v.Count/1024/1024) + } + if v.Metric == kperf.PMU_DDR_WRITE_BW { + fmt.Printf("write bandwidth(%v): %v M/s\n", v.NumaId, v.Count/1024/1024) } - PmuDisable(pd); - PmuClose(pd); +} +kperf.DevDataFree(deivceDataVo) +kperf.PmuDataFree(dataVo) +kperf.PmuClose(fd) +``` + +执行上述代码,输出的结果类似如下: +``` +read bandwidth(0): 17.32 M/s +read bandwidth(1): 5.43 M/s +read bandwidth(2): 2.83 M/s +read bandwidth(3): 4.09 M/s +write bandwidth(0): 4.35 M/s +write bandwidth(1): 2.29 M/s +write bandwidth(2): 0.84 M/s +write bandwidth(3): 0.97 M/s +``` + +### 采集L3 cache的时延 +libkperf提供了采集L3 
cache平均时延的能力,用于分析访存型应用的性能瓶颈。 +采集是以cluster为粒度,每个cluster包含4个cpu core(如果开启了超线程则是8个),可以通过PmuGetClusterCore来获取cluster id对应的core id。 + +参考代码: +```c++ +// c++代码示例 +PmuDeviceAttr devAttr[1]; +// L3平均时延 +devAttr[0].metric = PMU_L3_LAT; +// 初始化采集任务 +int pd = PmuDeviceOpen(devAttr, 1); +// 开始采集 +PmuEnable(pd); +sleep(1); +PmuData *oriData = nullptr; +int oriLen = PmuRead(pd, &oriData); +PmuDeviceData *devData = nullptr; +auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 1, &devData); +// devData的长度等于cluster个数 +for (int i=0;i= 4 and j < 8: - print(f"write bandwidth: {bandwidth} M/s\n") - j += 1 - kperf.disable(pd) - kperf.close(pd) +dev_attr = [ + kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_L3_LAT) +] +pd = kperf.device_open(dev_attr) +kperf.enable(pd) +time.sleep(1) +kperf.disable(pd) +ori_data = kperf.read(pd) +dev_data = kperf.get_device_metric(ori_data, dev_attr) +for data in dev_data.iter: + print(f"L3 latency({data.clusterId}): {data.count} cycles") ``` ```go // go代码用例 -import "libkperf/kperf" -import "time" -import "fmt" +deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_L3_LAT}} +fd, _ := kperf.PmuDeviceOpen(deviceAttrs) +kperf.PmuEnable(fd) +time.Sleep(1 * time.Second) +kperf.PmuDisable(fd) +dataVo, _ := kperf.PmuRead(fd) +deivceDataVo, _ := kperf.PmuGetDevMetric(dataVo, deviceAttrs) +for _, v := range deivceDataVo.GoDeviceData { + fmt.Printf("L3 latency(%v): %v cycles\n", v.ClusterId, v.Count) +} +kperf.DevDataFree(deivceDataVo) +kperf.PmuDataFree(dataVo) +kperf.PmuClose(fd) +``` -func main() { - evtList := []string{"hisi_sccl1_ddrc/flux_rd/", - "hisi_sccl3_ddrc/flux_rd/", - "hisi_sccl5_ddrc/flux_rd/", - "hisi_sccl7_ddrc/flux_rd/", - "hisi_sccl1_ddrc/flux_wr/", - "hisi_sccl3_ddrc/flux_wr/", - "hisi_sccl5_ddrc/flux_wr/", - "hisi_sccl7_ddrc/flux_wr/"} - attr := kperf.PmuAttr{EvtList: evtList} - pd, err := kperf.PmuOpen(kperf.COUNT, attr) - if err != nil { - fmt.Printf("kperf pmuopen sample failed, expect err is nil, but is %v\n", 
err) - return - } - kperf.PmuEnable(pd) +执行上述代码,输出的结果类似如下: +``` +L3 latency(0): 101 cycles +L3 latency(1): 334.6 cycles +L3 latency(2): 267.8 cycles +L3 latency(3): 198.4 cycles +... +``` - for i := 0; i < 60; i++ { - time.Sleep(time.Second) - dataVo, err := kperf.PmuRead(pd) - if err != nil { - fmt.Printf("kperf pmuread failed, expect err is nil, but is %v\n", err) - } +### 采集PCIE带宽 +libkperf提供了采集PCIE带宽的能力,采集tx和rx方向的读写带宽,用于监控外部设备(nvme、gpu等)的带宽。 +并不是所有的PCIE设备都可以被采集带宽,鲲鹏的pmu设备只覆盖了一部分PCIE设备,可以通过PmuDeviceBdfList来获取当前环境可采集的PCIE设备或Root port。 - j := 0 - for _, o := range dataVo.GoData { - bandwith := o.Count * 32 / 1024 / 1024 - if j < 4 { - fmt.Printf("read bandwidth: %v M/s\n", bandwith) - } - if j >= 4 && j < 8 { - fmt.Printf("write bandwidth: %v M/s\n", bandwith) - } - j += 1 - } - } - kperf.PmuDisable(pd) - kperf.PmuClose(pd) +参考代码: +```c++ +// c++代码示例 +PmuDeviceAttr devAttr[1]; +// 采集PCIE设备RX的读带宽 +devAttr[0].metric = PMU_PCIE_RX_MRD_BW; +// 设置PCIE的bdf号 +devAttr[0].bdf = "16:04.0"; +// 初始化采集任务 +int pd = PmuDeviceOpen(devAttr, 1); +// 开始采集 +PmuEnable(pd); +sleep(1); +PmuData *oriData = nullptr; +int oriLen = PmuRead(pd, &oriData); +PmuDeviceData *devData = nullptr; +auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 1, &devData); +// devData的长度等于pcie设备的个数 +for (int i=0;i0x400804 1 ``` -### Blocked Sample采样 +### IO和计算热点混合采样(Blocked Sample) Blocked Sample是一种新增的采样模式,该模式下会同时采集进程处于on cpu和off cpu数据,通过配置blockedSample字段去进行使能,去同时采集cycles和context-switches事件,换算off cpu的period数据。 -说明: - -1、只支持SAMPLING模式采集 - -2、只支持对进程分析,不支持对系统分析 - -使用示例: -```bash -cd example -# 运行C++用例,并分析热点 -bash run.sh all -# 运行python用例,并分析热点 -bash run.sh all python=true +详细使用方法可以参考example/pmu_hotspot.cpp +编译命令: ``` - -### Uncore事件采集能力增强 -1、支持可配置化uncore事件配置,比如如下形式进行事件配置: -```bash -smmuv3_pmcg_100020/transaction,filter_enable=1,filter_stream_id=0x7d/ +g++ -g pmu_hotspot.cpp -o pmu_hotspot -I /path/to/libkperf/include -L /path/to/libkperf/lib -lkperf -lsym ``` -2、支持采集和查询L3、DDR、SMMU、PCIE性能数据,采集如下性能数据: -- 
每个core的L3带宽、hit、miss,支持920和920高性能版 -- 每个numa的L3 latency,支持920高性能版 -- 每个numa的DDR读写带宽,支持920和920高性能版 -- 指定bdf号的smmu的地址转换次数,支持920和920高性能版 -- 指定bdf号的pcie rx、tx方向的读写带宽,支持920高性能版 +对于例子: +``` +thread1: + busy_io + compute + while + write + fsync +thread2 + cpu_compute + while + compute +``` +既包含计算(compute)也包含IO(write, fsync),如果用perf采集,只能采集到on cpu的数据: +|overhead|Shared Object|Symbol| +|--------|-------------|------| +|99.94%|test_io|compute| +|0.03%|libpthread-2.17.so|__pthread_enable_asynccancel| +|0.00%|test_io|busy_io| -代码示例: -```C++ - // C++ 代码示例 - PmuDeviceAttr devAttr = {}; - devAttr.metric = PMU_L3_TRAFFIC; - int pd = PmuDeviceOpen(&devAttr, 1); +使用pmu_hotspot采集: +``` +pmu_hotspot 5 1 1 +``` - PmuEnable(pd); - sleep(1); - PmuDisable(pd); +输出结果: +|overhead|Shared Object|Symbol| +|--------|-------------|------| +|54.74%|libpthread-2.17.so|fsync| +|27.18%|test_io|compute| +采集到了fsync,得知该进程的IO占比大于计算占比。 - PmuData* oriData = nullptr; - int oriLen = PmuRead(pd, &oriData); +限制: - PmuDeviceData *devData = nullptr; - auto len = PmuGetDevMetric(oriData, oriLen, &devAttr, 1, &devData); -``` +1、只支持SAMPLING模式采集 -```python - # python 代码示例 - dev_attr = [ - kperf.PmuDeviceAttr(metric=kperf.PmuDeviceMetric.PMU_L3_TRAFFIC) - ] - pd = kperf.device_open(dev_attr) - - kperf.enable(pd) - time.sleep(1) - kperf.disable(pd) - ori_data = kperf.read(pd) - - - dev_data = kperf.get_device_metric(ori_data, dev_attr) -``` \ No newline at end of file +2、只支持对进程分析,不支持对系统分析 diff --git a/pmu/pmu_list.cpp b/pmu/pmu_list.cpp index 09a32f0..aa38b70 100644 --- a/pmu/pmu_list.cpp +++ b/pmu/pmu_list.cpp @@ -281,6 +281,19 @@ namespace KUNPENG_PMU { return userData; } + static void TrimKernelStack(PmuData &data) + { + auto stack = data.stack; + while (stack != nullptr && stack->symbol != nullptr) { + if (strcmp(stack->symbol->module, "[kernel]") == 0) { + stack = stack->next; + continue; + } + data.stack = stack; + break; + } + } + void HandleBlockData(std::vector& pmuData, std::vector& switchData) { 
std::sort(switchData.begin(), switchData.end(), [](const PmuSwitchData& a, const PmuSwitchData& b) { @@ -332,6 +345,9 @@ namespace KUNPENG_PMU { DBG_PRINT("New tid encountered: tid=%d\n", currentTid); } if (strcmp(item.evt, "context-switches") == 0) { + // Convert stack from 'schedule[kernel] -> futex_wait[kernel] -> ...[kernel] -> lock_wait -> start_thread' + // to 'lock_wait -> start_thread', only keeping user stack. + TrimKernelStack(item); // Before the context-switches event, there is only one cycles event, which we need to ignore. if (currentTs == 0) { DBG_PRINT("Ignoring first cycles event for tid=%d\n", item.tid); -- Gitee From e1c2fb22d43d1d7971903c712c21d071c2192298 Mon Sep 17 00:00:00 2001 From: echodo <2220386943@qq.com> Date: Thu, 17 Apr 2025 20:26:16 +0800 Subject: [PATCH 05/14] =?UTF-8?q?=E8=A7=A3=E5=86=B3bdf=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E4=B8=BA=E7=A9=BA,=E4=BD=86=E9=9D=9Enullptr=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/Go_API.md | 149 +++++++++++++++++++++++++- go/src/libkperf/kperf/kperf.go | 57 +++++----- go/src/libkperf_test/libkperf_test.go | 2 +- 3 files changed, 181 insertions(+), 27 deletions(-) diff --git a/docs/Go_API.md b/docs/Go_API.md index 4db378a..fa1ba8f 100644 --- a/docs/Go_API.md +++ b/docs/Go_API.md @@ -127,7 +127,7 @@ func PmuRead(fd int) (PmuDataVo, error) * Period uint64 采样间隔 * Count uint64 计数 * CountPercent float64 计数比值,使能时间/运行时间 - * CpuTopo CpuTopolopy + * CpuTopo CpuTopology * CoreId 系统核ID * NumaId numa ID * SocketId socket ID @@ -267,6 +267,7 @@ func PmuTraceRead(taskId int) (PmuTraceDataVo, error) * FuncName string 系统调用函数名 * ElapsedTime float64 耗时时间 + * StartTs 开始时间戳 * Pid int 进程id * Tid int 线程id * Cpu int cpu号 @@ -280,7 +281,7 @@ if err != nil { } for _, v := range traceList.GoTraceData { - fmt.Printf("funcName: %v, elapsedTime: %v ms pid: %v tid: %v, cpu: %v comm: %v\n", v.FuncName, v.ElapsedTime, v.Pid, v.Tid, v.Cpu, v.Comm) + 
fmt.Printf("funcName: %v, elapsedTime: %v ms startTs: %v pid: %v tid: %v, cpu: %v comm: %v\n", v.FuncName, v.ElapsedTime, v.StartTs, v.Pid, v.Tid, v.Cpu, v.Comm) } ``` @@ -307,3 +308,147 @@ func main() { } } ``` + +### kperf.PmuDeviceBdfList + +func PmuDeviceBdfList(bdfType C.enum_PmuBdfType) ([]string, error) 从系统中查找所有的bdf列表 +* bdfType C.enum_PmuBdfType + * PMU_BDF_TYPE_PCIE PCIE设备对应的bdf + * PMU_BDF_TYPE_SMMU SMMU设备对应的bdf +```go +import "libkperf/kperf" +import "fmt" + +func main() { + pcieBdfList, err := kperf.PmuDeviceBdfList(kperf.PMU_BDF_TYPE_PCIE) + if err != nil { + fmt.Printf("kperf GetDeviceBdfList failed, expect err is nil, but is %v\n", err) + } + for _, v := range pcieBdfList { + fmt.Printf("bdf is %v\n", v) + } +} +``` +### kperf.PmuDeviceOpen + +func PmuDeviceOpen(attr []PmuDeviceAttr) (int, error) 初始化采集uncore事件指标的能力 + +* type PmuDeviceAttr struct: + * Metric: 指定需要采集的指标 + * PMU_DDR_READ_BW 采集每个numa的ddrc的读带宽,单位:Bytes + * PMU_DDR_WRITE_BW 采集每个numa的ddrc的写带宽,单位:Bytes + * PMU_L3_TRAFFIC 采集每个core的L3的访问字节数,单位:Bytes + * PMU_L3_MISS 采集每个core的L3的miss数量,单位:count + * PMU_L3_REF 采集每个core的L3的总访问数量,单位:count + * PMU_L3_LAT 采集每个numa的L3的总时延,单位:cycles + * PMU_PCIE_RX_MRD_BW 采集pcie设备的rx方向上的读带宽,单位:Bytes/ns + * PMU_PCIE_RX_MWR_BW 采集pcie设备的rx方向上的写带宽,单位:Bytes/ns + * PMU_PCIE_TX_MRD_BW 采集pcie设备的tx方向上的读带宽,单位:Bytes/ns + * PMU_PCIE_TX_MWR_BW 采集pcie设备的tx方向上的写带宽,单位:Bytes/ns + * PMU_SMMU_TRAN 采集指定smmu设备的地址转换次数,单位:count + * Bdf: 指定需要采集设备的bdf号,只对pcie和smmu指标有效 +* 返回值是int和error,pd > 0表示初始化成功,pd == -1初始化失败,可通过kperf.error()查看错误信息,以下是一个kperf.PmuDeviceOpen的示例 + +```go +import "libkperf/kperf" +import "fmt" + +func main() { + deviceAttrs := []kperf.PmuDeviceAttr{kperf.PmuDeviceAttr{Metric: kperf.PMU_L3_LAT}} + fd, err := kperf.PmuDeviceOpen(deviceAttrs) + if err != nil { + fmt.Printf("kperf PmuDeviceOpen failed, expect err is nil, but is %v\n", err) + } +} +``` + +### kperf.PmuGetDevMetric + +func PmuGetDevMetric(dataVo PmuDataVo, deviceAttr []PmuDeviceAttr) (PmuDeviceDataVo, error)
对原始read接口的数据,按照device_attr中给定的指标进行数据聚合接口,返回值是PmuDeviceData + +* type PmuDataVo struct: PmuRead接口返回的原始数据 +* []PmuDeviceAttr: 指定需要聚合的指标参数 +* type PmuDeviceDataVo struct: + * GoDeviceData []PmuDeviceData +* type PmuDeviceData struct: + * Metric C.enum_PmuDeviceMetric 采集的指标 + * Count float64 指标的计数值 + * Mode C.enum_PmuMetricMode 指标的采集类型,按core、按numa还是按bdf号 + * CoreId uint32 数据的core编号 + * NumaId uint32 数据的numa编号 + * ClusterId uint32 簇ID + * Bdf string 数据的bdf编号 + +### kperf.DevDataFree + +func DevDataFree(devVo PmuDeviceDataVo) 清理PmuDeviceData的指针数据 + +### kperf.PmuGetClusterCore + +func PmuGetClusterCore(clusterId uint) ([]uint, error) 查询指定clusterId下对应的core列表 + +* clusterId CPU的clusterId编号 +* 返回值:当前clusterId下对应的core列表,出现错误则列表为空,且error不为空 + +```go +import "libkperf/kperf" +import "fmt" + +func main() { + clusterId := uint(1) + coreList, err := kperf.PmuGetClusterCore(clusterId) + if err != nil { + fmt.Printf("kperf PmuGetClusterCore failed, expect err is nil, but is %v\n", err) + return + } + for _, v := range coreList { + fmt.Printf("coreId has:%v\n", v) + } +} +``` + +### kperf.PmuGetNumaCore + +func PmuGetNumaCore(nodeId uint) ([]uint, error) 查询指定numaId下对应的core列表 + +* nodeId numa对应的ID +* 返回值为对应numaId下的cpu core列表,出现错误则列表为空,且error不为空 + +```go +import "libkperf/kperf" +import "fmt" + +func main() { + nodeId := uint(0) + coreList, err := kperf.PmuGetNumaCore(nodeId) + if err != nil { + fmt.Printf("kperf PmuGetNumaCore failed, expect err is nil, but is %v\n", err) + return + } + for _, v := range coreList { + fmt.Printf("coreId has:%v\n", v) + } +} +``` + + +### kperf.PmuGetCpuFreq +func PmuGetCpuFreq(core uint) (int64, error) 查询当前系统指定core的实时CPU频率 + +* core cpu coreId +* 返回值为int64, 是当前cpu core的实时频率,出现错误频率为-1,且error不为空 + +```go +import "libkperf/kperf" +import "fmt" + +func main() { + coreId := uint(0) + freq, err := kperf.PmuGetCpuFreq(coreId) + if err != nil { + fmt.Printf("kperf PmuGetCpuFreq failed, expect err is nil, but is %v\n", err) + return + } + fmt.Printf("coreId %v
freq is %v\n", coreId, freq) +} +``` \ No newline at end of file diff --git a/go/src/libkperf/kperf/kperf.go b/go/src/libkperf/kperf/kperf.go index cf10f71..2b5958c 100644 --- a/go/src/libkperf/kperf/kperf.go +++ b/go/src/libkperf/kperf/kperf.go @@ -308,13 +308,13 @@ type PmuAttr struct { CallStack bool // This indicates whether to collect whole callchains or only top frame DataFilter C.enum_SpeFilter // Spe Data Filter.Refer to comments of SpeFilter EvFilter C.enum_SpeEventFilter // Spe Event filter.Refer to comments of SpeEventFilter - MinLatency uint64 // Collect only smaples with latency or higher - IncludeNewFork bool // enable it you can get the new child thread count, only in couting mode - BranchSampleFilter uint64 // if the filering mode is set, branch_sample_stack data is collected in sampling mode - BlockedSample bool // This indicates whether the blocked sample mode is enabled. In this mode, both on Cpu and off Cpu data is collectd + MinLatency uint64 // Collect only samples with latency or higher + IncludeNewFork bool // enable it you can get the new child thread count, only in counting mode + BranchSampleFilter uint64 // if the filter mode is set, branch_sample_stack data is collected in sampling mode + BlockedSample bool // This indicates whether the blocked sample mode is enabled. In this mode, both on Cpu and off Cpu data is collected } -type CpuTopolopy struct { +type CpuTopology struct { CoreId int // cpu core id NumaId int // numa id SocketId int // socket id @@ -338,12 +338,12 @@ type PmuData struct { Ts uint64 // time stamp. uint: ns Pid int // process id Tid int // thread id - Cpu int // cpu id + Cpu int // cpu id Comm string // process command Period uint64 // sample period Count uint64 // event count. Only available for counting CountPercent float64 // event count Percent. 
when count = 0, countPercent = -1; Only available for counting - CpuTopo CpuTopolopy // cpu topolopy + CpuTopo CpuTopology // cpu topology Symbols []sym.Symbol // symbol list BranchRecords []BranchSampleRecord // branch record list SpeExt SpeDataExt // SPE data @@ -353,7 +353,7 @@ type PmuData struct { type PmuDataVo struct { GoData []PmuData // PmuData list - cData *C.struct_PmuData // Pointer to PmuData in inferface C + cData *C.struct_PmuData // Pointer to PmuData in interface C fd int // fd } @@ -374,10 +374,11 @@ type PmuTraceAttr struct { // PmuTraceData info type PmuTraceData struct { FuncName string // function name + StartTs int64 // start timestamp. uint: us ElapsedTime float64 // elapsed time Pid int // process id Tid int // thread id - Cpu int // cpu id + Cpu int // cpu id Comm string // process command } @@ -390,7 +391,7 @@ type PmuTraceDataVo struct { type PmuDeviceAttr struct { Metric C.enum_PmuDeviceMetric - // Used for PMU_PCIE_XXX and PMU_SMMU_XXX to collect a specifi pcie device. + // Used for PMU_PCIE_XXX and PMU_SMMU_XXX to collect a specific pcie device. // The string of bdf is something like '7a:01.0'. Bdf string } @@ -403,7 +404,7 @@ type PmuDeviceData struct { Mode C.enum_PmuMetricMode // Field of union depends on the above . CoreId uint32 // for percore metric NumaId uint32 // for pernuma metric - ClusterId uint32 // for percluster emtric + ClusterId uint32 // for percluster metric Bdf string // for perpcie metric } @@ -548,7 +549,7 @@ func PmuEventList(eventType C.enum_PmuEventType) []string { // Enable counting or sampling of task . // On success, nil is returned. // On error, error is returned. -// param pd task id +// param fd task id // return error func PmuEnable(fd int) error { rs := C.PmuEnable(C.int(fd)) @@ -561,7 +562,7 @@ func PmuEnable(fd int) error { // Disable counting or sampling of task . // On success, nil is returned. // On error, error is returned. 
-// param pd task id +// param fd task id // return err func PmuDisable(fd int) error { rs := C.PmuDisable(C.int(fd)) @@ -613,7 +614,7 @@ func PmuDataFree(data PmuDataVo) { // Close task with id // After PmuClose is called, all pmu data related to the task become invalid -// param pd task id +// param fd task id func PmuClose(fd int) { if fd <= 0 { return @@ -627,7 +628,7 @@ func PmuClose(fd int) { } // stop a sampling task in asynchronous mode -// param pd pmu descriptor. +// param fd pmu descriptor. func PmuStop(fd int) { if fd <= 0 { return @@ -641,7 +642,7 @@ func PmuStop(fd int) { // That is to say, for COUNTING, counts of all pmu event are reset to zero in PmuRead // For SAMPLING and SPE_SAMPLING, samples collected are started from the last PmuEnable or PmuRead // On success, PmuDataVo is returned -// param pd task id +// param fd task id // return PmuDataVo and error func PmuRead(fd int) (PmuDataVo, error) { pmuDataVo := PmuDataVo{} @@ -762,7 +763,7 @@ func PmuTraceOpen(traceType C.enum_PmuTraceType, traceAttr PmuTraceAttr) (int, e // Enable trace collection of task // On success, nil is returned. // On error, -1 is returned. -// param pd trace collect task id +// param taskId trace collect task id // return error code func PmuTraceEnable(taskId int) error { rs := C.PmuTraceEnable(C.int(taskId)) @@ -775,7 +776,7 @@ func PmuTraceEnable(taskId int) error { // Disable trace collection of task // On success, nil is returned // On error, error is returned -// param pd trace collect task id +// param taskId trace collect task id // return error code func PmuTraceDisable(taskId int) error { rs := C.PmuTraceDisable(C.int(taskId)) @@ -788,7 +789,7 @@ func PmuTraceDisable(taskId int) error { // Collect data. 
// Pmu trace data are collected starting from the last PmuTraceEnable or PmuTraceRead // On success, PmuTraceDataVo is returned -// param pd trace collect task id +// param taskId trace collect task id // param PmuTraceDataVo pmu trace data // return PmuTraceDataVo and error func PmuTraceRead(taskId int) (PmuTraceDataVo, error) { @@ -812,7 +813,7 @@ func PmuTraceRead(taskId int) (PmuTraceDataVo, error) { cDataList := *(*[]C.struct_PmuTraceData)(unsafe.Pointer(&slice)) goTraceData := make([]PmuTraceData, int(traceLen)) for i, v := range cDataList { - goTraceData[i] = PmuTraceData{FuncName:C.GoString(v.funcs), ElapsedTime:float64(v.elapsedTime), Pid:int(v.pid), Tid: int(v.tid), Cpu: int(v.cpu), Comm: C.GoString(v.comm)} + goTraceData[i] = PmuTraceData{FuncName:C.GoString(v.funcs), StartTs: int64(v.startTs), ElapsedTime:float64(v.elapsedTime), Pid:int(v.pid), Tid: int(v.tid), Cpu: int(v.cpu), Comm: C.GoString(v.comm)} } res.GoTraceData = goTraceData res.cTraceData = cTraceData @@ -821,7 +822,7 @@ func PmuTraceRead(taskId int) (PmuTraceDataVo, error) { // Close task with id . 
// After PmuTraceClose is called, all pmu trace data related to the task become invalid -// param collect task id +// param taskId task id func PmuTraceClose(taskId int) { C.PmuTraceClose(C.int(taskId)) } @@ -926,7 +927,11 @@ func PmuDeviceOpen(attr []PmuDeviceAttr) (int, error) { cAttr := make([]C.struct_PmuDeviceAttr, len(attr)) for i, v := range attr { cAttr[i].metric = v.Metric - cAttr[i].bdf = C.CString(v.Bdf) + if len(v.Bdf) > 0 { + cAttr[i].bdf = C.CString(v.Bdf) + } else { + cAttr[i].bdf = nil + } } deviceTaskId := C.PmuDeviceOpen(&cAttr[0], C.uint(len(attr))) if int(deviceTaskId) == -1 { @@ -947,7 +952,11 @@ func PmuGetDevMetric(dataVo PmuDataVo, deviceAttr []PmuDeviceAttr) (PmuDeviceDat cAttr := make([]C.struct_PmuDeviceAttr, len(deviceAttr)) for i, v := range deviceAttr { cAttr[i].metric = v.Metric - cAttr[i].bdf = C.CString(v.Bdf) + if len(v.Bdf) > 0 { + cAttr[i].bdf = C.CString(v.Bdf) + } else { + cAttr[i].bdf = nil + } } metricLen := C.int(0) metricData := C.IPmuGetMetric(dataVo.cData, C.uint(len(dataVo.GoData)), &cAttr[0], C.uint(len(deviceAttr)), &metricLen) @@ -1071,7 +1080,7 @@ func transferCPmuDataToGoData(cPmuData *C.struct_PmuData, dataLen int, fd int) [ goDatas[i].CountPercent = float64(dataObj.countPercent) goDatas[i].Cpu = int(dataObj.cpu) if dataObj.cpuTopo != nil { - goDatas[i].CpuTopo = CpuTopolopy{CoreId: int(dataObj.cpuTopo.coreId), NumaId: int(dataObj.cpuTopo.numaId), SocketId: int(dataObj.cpuTopo.socketId)} + goDatas[i].CpuTopo = CpuTopology{CoreId: int(dataObj.cpuTopo.coreId), NumaId: int(dataObj.cpuTopo.numaId), SocketId: int(dataObj.cpuTopo.socketId)} } if dataObj.ext != nil { diff --git a/go/src/libkperf_test/libkperf_test.go b/go/src/libkperf_test/libkperf_test.go index a0343d5..2f55951 100644 --- a/go/src/libkperf_test/libkperf_test.go +++ b/go/src/libkperf_test/libkperf_test.go @@ -157,7 +157,7 @@ func TestSysCallTrace(t *testing.T) { t.Logf("==========================pmu get trace data success==========================") for 
_, v := range traceList.GoTraceData { - t.Logf("comm=%v, func=%v, elapsedTime=%v, pid=%v, tid=%v, cpu=%v", v.Comm, v.FuncName, v.ElapsedTime, v.Pid, v.Tid, v.Cpu) + t.Logf("comm=%v, func=%v, elapsedTime=%v, startTs=%v, pid=%v, tid=%v, cpu=%v", v.Comm, v.FuncName, v.ElapsedTime, v.StartTs, v.Pid, v.Tid, v.Cpu) } kperf.PmuTraceFree(traceList) -- Gitee From 389c9bbc41a1a714ec7a64f98b3b26e48a875155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Wed, 23 Apr 2025 10:04:36 +0800 Subject: [PATCH 06/14] fix blockedSample bugs --- example/pmu_hotspot.cpp | 5 ++++- example/pmu_hotspot.go | 34 +++++++++++++++++++--------------- example/pmu_hotspot.py | 5 ++++- pmu/pmu_list.cpp | 19 ++++++++----------- pmu/pmu_metric.h | 2 +- 5 files changed, 36 insertions(+), 29 deletions(-) diff --git a/example/pmu_hotspot.cpp b/example/pmu_hotspot.cpp index 9487368..cdfeee3 100644 --- a/example/pmu_hotspot.cpp +++ b/example/pmu_hotspot.cpp @@ -111,7 +111,7 @@ int GetPmuDataHotspot(PmuData* pmuData, int pmuDataLen, std::vector& tm std::string GetPeriodPercent(uint64_t period) { std::ostringstream oss; - oss << std::fixed << std::setprecision(FLOAT_PRECISION) << (static_cast(period) * 100 / g_totalPeriod); + oss << std::fixed << std::setprecision(FLOAT_PRECISION) << (static_cast(period) / g_totalPeriod * 100.0); return oss.str(); } @@ -209,6 +209,7 @@ void BlockedSample(int pid, double interval, int count, bool blockedSample) std::cerr << "error msg:" << Perror() << std::endl; return; } + g_totalPeriod = 0; std::vector hotSpotData; GetPmuDataHotspot(pmuData, len, hotSpotData); PrintHotSpotGraph(hotSpotData); @@ -219,6 +220,7 @@ void BlockedSample(int pid, double interval, int count, bool blockedSample) for (int i = 0; i < hotSpotData.size(); ++i) { PrintStack(hotSpotData[i].stack, 0, hotSpotData[i].period); } + g_totalPeriod = 0; PmuDataFree(pmuData); } PmuDisable(pd); @@ -253,6 +255,7 @@ void print_usage() { std::cerr << " process name: process path or input process 
number\n"; std::cerr << " example: pmu_hotspot 0.1 10 0 ./process\n"; std::cerr << " example: pmu_hotspot 1 100 1 ./process\n"; + std::cerr << " example: pmu_hotspot 1 100 1 \n"; } int main(int argc, char** argv) diff --git a/example/pmu_hotspot.go b/example/pmu_hotspot.go index fd8cc73..ef3c1bd 100644 --- a/example/pmu_hotspot.go +++ b/example/pmu_hotspot.go @@ -1,17 +1,18 @@ -""" -Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. -libkperf licensed under the Mulan PSL v2. -You can use this software according to the terms and conditions of the Mulan PSL v2. -You may obtain a copy of Mulan PSL v2 at: - http://license.coscl.org.cn/MulanPSL2 -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -PURPOSE. -See the Mulan PSL v2 for more details. -Author: Mr.Li -Create: 2025-04-09 -Description: Analyze the original data of performance monitoring unit, and compute the hotspot data. -""" +/****************************************************************************** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * libkperf licensed under the Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR + * PURPOSE. + * See the Mulan PSL v2 for more details. + * Author: Mr.Li + * Create: 2025-04-09 + * Description: Pmu data hotspot analysis module. + * Current capability: Analyze the original data of performance monitoring unit, and compute the hotspot data. 
+ ******************************************************************************/ package main import "os" @@ -35,6 +36,7 @@ func printUsage() { fmt.Println(" process name: process path or input process number") fmt.Println(" example: ./pmu_hotspot_of_go 0.1 10 0 ./process") fmt.Println(" example: ./pmu_hotspot_of_go 1 100 1 ./process") + fmt.Println(" example: ./pmu_hotspot_of_go 1 100 1 ") } var GlobalPeriod uint64 = 0 @@ -118,7 +120,7 @@ func GetPmuDataHotSpot(vo kperf.PmuDataVo) []kperf.PmuData { } func getPeriodPercent(period uint64) float64 { - return float64(period) * 100.00 / float64(GlobalPeriod) + return float64(period) / float64(GlobalPeriod) * 100.00 } func printHotSpotGraph(hotspotData []kperf.PmuData) { @@ -208,6 +210,7 @@ func blockSample(pid int, interval float64, count int, blockedSample int) { return } + GlobalPeriod = 0 hotspotData := GetPmuDataHotSpot(pmuDataVo) printHotSpotGraph(hotspotData) fmt.Printf(strings.Repeat("=", 50) + "Print the call stack of the hotspot function" + strings.Repeat("=", 50) + "\n") @@ -215,6 +218,7 @@ func blockSample(pid int, interval float64, count int, blockedSample int) { for _, data := range hotspotData { printStack(data.Symbols, data.Period) } + GlobalPeriod = 0 } kperf.PmuDisable(fd) kperf.PmuClose(fd) diff --git a/example/pmu_hotspot.py b/example/pmu_hotspot.py index fc76c3a..eea4b58 100644 --- a/example/pmu_hotspot.py +++ b/example/pmu_hotspot.py @@ -79,7 +79,7 @@ def get_pmu_data_hotspot(pmu_data, tmp_data): def get_period_percent(period): - return f"{(period * 100 / g_total_period):.{FLOAT_PRECISION}f}" + return f"{(period / g_total_period * 100.0):.{FLOAT_PRECISION}f}" def print_stack(stack, depth=0, period=0): @@ -157,6 +157,7 @@ def blocked_sample(pid, interval, count, blockedSample): if pmu_data == -1: print(f"read failed, error msg: {kperf.error()}") return + g_total_period = 0 hotspot_data = [] get_pmu_data_hotspot(pmu_data, hotspot_data) print_hotspot_graph(hotspot_data) @@ -164,6 +165,7 @@ def 
blocked_sample(pid, interval, count, blockedSample): print(f"{'@symbol':<40}{'@module':<40}{'@percent':>40}") for data in hotspot_data: print_stack(data.stack, 0, data.period) + g_total_period = 0 err = kperf.disable(pd) if err != 0: print(f"disable failed, error msg: {kperf.error()}") @@ -192,6 +194,7 @@ def print_usage(): print(" process name: process path or input process number") print(" example: python3 pmu_hotspot.py 0.1 10 0 ./process") print(" example: python3 pmu_hotspot.py 1 100 1 ./process") + print(" example: python3 pmu_hotspot.py 1 100 1 ") def main(): pid = 0 diff --git a/pmu/pmu_list.cpp b/pmu/pmu_list.cpp index aa38b70..360b875 100644 --- a/pmu/pmu_list.cpp +++ b/pmu/pmu_list.cpp @@ -309,7 +309,7 @@ namespace KUNPENG_PMU { if (item.swOut) { outTime = item.ts; prevTid = item.tid; - DBG_PRINT("Switch out: tid=%d, ts=%ld\n", item.tid, item.ts); + DBG_PRINT("Switch out: tid=%d, ts=%llu\n", item.tid, item.ts); } else { // if the first event is sched_in, we need to ignore it. if (prevTid == -1) { @@ -318,7 +318,7 @@ namespace KUNPENG_PMU { } if (prevTid == item.tid && outTime > 0) { tidToOffTimeStamps[item.tid].emplace_back(item.ts - outTime); - DBG_PRINT("Switch in: tid=%d, ts=%ld, offTime=%ld\n", item.tid, item.ts, item.ts - outTime); + DBG_PRINT("Switch in: tid=%d, ts=%llu, offTime=%llu\n", item.tid, item.ts, item.ts - outTime); outTime = 0; } } @@ -350,24 +350,21 @@ namespace KUNPENG_PMU { TrimKernelStack(item); // Before the context-switches event, there is only one cycles event, which we need to ignore. if (currentTs == 0) { + currentTs = item.ts; DBG_PRINT("Ignoring first cycles event for tid=%d\n", item.tid); continue; } - // only the on cpu event is cycles or cpu-clock, this compute is right. + // only the on cpu event is cycles, this compute is right. 
if (csCnt < tidToOffTimeStamps[item.tid].size()) { item.period = tidToOffTimeStamps[item.tid][csCnt] * curPeriod / (currentTs - prevTs); - DBG_PRINT("Context switch: tid=%d, period=%ld\n", item.tid, item.period); + DBG_PRINT("Context switch: ts=%llu, tid=%d, period=%llu\n", item.ts, item.tid, item.period); csCnt++; } } else { // on cpu event data update. - if (prevTs == 0) { - prevTs = item.ts; - } else { - prevTs = currentTs; - currentTs = item.ts; - curPeriod = item.period; - } + prevTs = currentTs; + currentTs = item.ts; + curPeriod = item.period; } } } diff --git a/pmu/pmu_metric.h b/pmu/pmu_metric.h index 593f97d..eae8524 100644 --- a/pmu/pmu_metric.h +++ b/pmu/pmu_metric.h @@ -9,7 +9,7 @@ * PURPOSE. * See the Mulan PSL v2 for more details. * Author: Mr.Lei - * Create: 2024-04-17 + * Create: 2025-04-17 * Description: definition of pmu metric some interface for free ******************************************************************************/ -- Gitee From f19c70c1760bc56006787697fe80f887aca4919d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Wed, 23 Apr 2025 17:51:29 +0800 Subject: [PATCH 07/14] fix for not souppot cpu info --- include/pcerrc.h | 1 + pmu/pmu_metric.cpp | 10 +++++++++- python/modules/kperf/perror.py | 1 + test/test_perf/test_metric.cpp | 1 + 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/pcerrc.h b/include/pcerrc.h index 55a2f9e..767a48b 100644 --- a/include/pcerrc.h +++ b/include/pcerrc.h @@ -106,6 +106,7 @@ extern "C" { #define LIBPERF_ERR_CPUFREQ_NOT_CONFIG 1062 #define LIBPERF_ERR_CLUSTER_ID_OVERSIZE 1063 #define LIBPERF_ERR_INVALID_PMU_BDF_TYPE 1064 +#define LIBPERF_ERR_NOT_SUPPORT_METRIC 1065 #define UNKNOWN_ERROR 9999 diff --git a/pmu/pmu_metric.cpp b/pmu/pmu_metric.cpp index 6dffb3b..a2163a1 100644 --- a/pmu/pmu_metric.cpp +++ b/pmu/pmu_metric.cpp @@ -298,7 +298,11 @@ namespace KUNPENG_PMU { static const map GetDeviceMtricConfig() { - return UNCORE_METRIC_CONFIG_MAP.at(GetCpuType()); + 
CHIP_TYPE chipType = GetCpuType(); + if (UNCORE_METRIC_CONFIG_MAP.find(chipType) == UNCORE_METRIC_CONFIG_MAP.end()) { + return {}; + } + return UNCORE_METRIC_CONFIG_MAP.at(chipType); } static int QueryUncoreRawDevices() @@ -757,6 +761,10 @@ namespace KUNPENG_PMU { static int CheckDeviceMetricEnum(PmuDeviceMetric metric) { const auto& metricConfig = GetDeviceMtricConfig(); + if (metricConfig.empty()) { + New(LIBPERF_ERR_NOT_SUPPORT_METRIC, "The current platform cpu does not support uncore metric collection."); + return LIBPERF_ERR_NOT_SUPPORT_METRIC; + } if (metricConfig.find(metric) == metricConfig.end()) { New(LIBPERF_ERR_INVALID_PMU_DEVICES_METRIC, "For this platform this metric " + GetMetricString(metric) + " is invalid value for PmuDeviceMetric!"); diff --git a/python/modules/kperf/perror.py b/python/modules/kperf/perror.py index 5183196..bc8c8e2 100644 --- a/python/modules/kperf/perror.py +++ b/python/modules/kperf/perror.py @@ -105,6 +105,7 @@ class Error: LIBPERF_ERR_CPUFREQ_NOT_CONFIG = 1062 LIBPERF_ERR_CLUSTER_ID_OVERSIZE = 1063 LIBPERF_ERR_INVALID_PMU_BDF_TYPE = 1064 + LIBPERF_ERR_NOT_SUPPORT_METRIC = 1065 UNKNOWN_ERROR = 9999 diff --git a/test/test_perf/test_metric.cpp b/test/test_perf/test_metric.cpp index 85b8653..56bee10 100644 --- a/test/test_perf/test_metric.cpp +++ b/test/test_perf/test_metric.cpp @@ -107,6 +107,7 @@ TEST_F(TestMetric, CollectDDRBandwidth) PmuDeviceAttr devAttr = {}; devAttr.metric = PMU_DDR_READ_BW; int pd = PmuDeviceOpen(&devAttr, 1); + cout << Perror() << endl; ASSERT_NE(pd, -1); PmuEnable(pd); sleep(1); -- Gitee From 58942c131bd4a25ef2dd624f5a5a214328cd87e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Mon, 28 Apr 2025 15:53:10 +0800 Subject: [PATCH 08/14] add blocked_sample case process, opt stack only print top 10 --- example/case/blocked_sample_case.cpp | 465 +++++++++++++++++++++++++++ example/pmu_hotspot.cpp | 6 +- example/pmu_hotspot.go | 8 +- example/pmu_hotspot.py | 5 +- 4 files changed, 479 
insertions(+), 5 deletions(-) create mode 100644 example/case/blocked_sample_case.cpp diff --git a/example/case/blocked_sample_case.cpp b/example/case/blocked_sample_case.cpp new file mode 100644 index 0000000..4db89e7 --- /dev/null +++ b/example/case/blocked_sample_case.cpp @@ -0,0 +1,465 @@ +/* + * Optimization Notes: + * + * This program constructs a multi-threaded task, where each task consists of three phases: + * + * 1. on-CPU computation: + * Two modes are provided: + * - inefficient: Simulates inefficient computation using heavy floating-point operations (default). + * - efficient: Uses integers instead of floating-point numbers for optimized computation + * (though more efficient, overall time remains almost unchanged as off-CPU phase (synchronous IO) is the bottleneck). + * + * 2. IO operation phase: + * Three modes are provided: + * - global: Write to a single file protected by a global lock (baseline). + * - split: Each thread writes to its own file (reduces lock contention). + * - async: Asynchronous IO, enqueues data for background batch writing (previous version lacked batching, causing worse performance). + * + * 3. Supplemental on-CPU computation. + * + * Usage (command-line argument order): + * [numThreads] [tasksPerThread] [cpuIterations] [ioDataSize] [ioWrites] [ioMode] [onCpuMode] + * + * Example (your given test parameters, plus onCpuMode parameter): + * ./blocked_sample_io 4 50 100000 5000 3000 global inefficient + * + * Where: + * ioMode: global|split|async + * onCpuMode: inefficient (inefficient implementation) or efficient (optimized implementation) + * + * Note: If the user attempts to optimize the CPU computation part using the efficient on-CPU mode, + * the overall runtime remains almost unchanged, proving that the bottleneck lies mainly in the off-CPU part (synchronous IO and lock contention). 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std; +using namespace std::chrono; + +// Define IO mode enumeration +enum class IOMode { GLOBAL, SPLIT, ASYNC }; +IOMode currentIOMode = IOMode::GLOBAL; // Default IO mode + +//------------------------------------------------------------- +// on-CPU simulation: Implementation of two computation methods +//------------------------------------------------------------- +// Inefficient CPU work: Heavy loop computation to prevent compiler optimization +void doOnCpuWorkInefficient(int iterations) { + volatile double dummy = 1.0; + for (int i = 0; i < iterations; i++) { + dummy = dummy * 1.000001 + 0.000001; + } + (void)dummy; +} + +// Efficient CPU work: Use integers to simulate decimals for optimized computation +void doOnCpuWorkEfficient(int iterations) { + long long dummy = 1000000; // Use integers to simulate decimals, assuming precision of 1e-6 + for (int i = 0; i < iterations; i++) { + dummy = dummy * 1000001 / 1000000 + 1; + } + (void)dummy; +} + +// Global flag to decide which on-CPU computation method to use (default is inefficient) +bool efficientOnCpu = false; + +// Encapsulated on-CPU work interface, calls corresponding implementation based on efficientOnCpu +void doOnCpuWork(int iterations) { + if (efficientOnCpu) { + doOnCpuWorkEfficient(iterations); + } else { + doOnCpuWorkInefficient(iterations); + } +} + +//------------------------------------------------------------- +// GLOBAL mode: Global file and mutex +//------------------------------------------------------------- +mutex globalFileMutex; +ofstream globalSyncFile; // Global file + +//------------------------------------------------------------- +// Asynchronous IO Manager (optimized): Batch writing to reduce flush frequency +//------------------------------------------------------------- +class AsyncIOManager { +private: 
+ queue msgQueue; + mutex mtx; + condition_variable cv; + atomic stop; + thread worker; + ofstream outFile; + const size_t batchSize; // Number of messages written per batch + +public: + AsyncIOManager(const string& filename, size_t batchSize = 50) + : stop(false), batchSize(batchSize) + { + outFile.open(filename, ios::out | ios::trunc); + if (!outFile.is_open()){ + cerr << "Failed to open file: " << filename << endl; + } + worker = thread([this]() { this->process(); }); + } + + ~AsyncIOManager(){ + { + lock_guard lock(mtx); + stop = true; + } + cv.notify_one(); + if(worker.joinable()){ + worker.join(); + } + if(outFile.is_open()){ + outFile.close(); + } + } + + // Push message to be written into the queue + void push(const string &msg) { + { + lock_guard lock(mtx); + msgQueue.push(msg); + } + cv.notify_one(); + } + +private: + // Background thread processes batch writes + void process() { + while (true) { + vector localBatch; + { + unique_lock lock(mtx); + cv.wait(lock, [this]() { return stop || !msgQueue.empty(); }); + while (!msgQueue.empty() && localBatch.size() < batchSize) { + localBatch.push_back(msgQueue.front()); + msgQueue.pop(); + } + if (stop && localBatch.empty()) { + break; + } + } + // Merge and write batch, then flush + if (outFile.is_open()) { + string batchStr; + for (const auto &msg : localBatch) { + batchStr.append(msg); + } + outFile << batchStr; + outFile.flush(); + } + } + } +}; + +AsyncIOManager *asyncIO = nullptr; // Global pointer to asynchronous IO manager + +//------------------------------------------------------------- +// Thread Pool: Manages worker threads and task queue +//------------------------------------------------------------- +class ThreadPool { +public: + ThreadPool(size_t threads); + ~ThreadPool(); + void enqueue(function task); + void wait(); + +private: + vector workers; + queue> tasks; + mutex queue_mutex; + condition_variable condition; + atomic stop; + atomic active_tasks; + condition_variable cv_finished; +}; + 
+ThreadPool::ThreadPool(size_t threads) : stop(false), active_tasks(0) { + for (size_t i = 0; i < threads; i++) { + workers.emplace_back([this, i]() { + while (true) { + function task; + { + unique_lock lock(this->queue_mutex); + this->condition.wait(lock, [this]() { + return this->stop.load() || !this->tasks.empty(); + }); + if (this->stop.load() && this->tasks.empty()) + return; + task = move(this->tasks.front()); + this->tasks.pop(); + active_tasks++; + } + task(); + { + lock_guard lock(this->queue_mutex); + active_tasks--; + if (tasks.empty() && active_tasks == 0) { + cv_finished.notify_all(); + } + } + } + }); + } +} + +ThreadPool::~ThreadPool() { + { + lock_guard lock(queue_mutex); + stop.store(true); + } + condition.notify_all(); + for (thread &worker : workers) { + if (worker.joinable()) + worker.join(); + } +} + +void ThreadPool::enqueue(function task) { + { + lock_guard lock(queue_mutex); + tasks.push(move(task)); + } + condition.notify_one(); +} + +void ThreadPool::wait() { + unique_lock lock(queue_mutex); + cv_finished.wait(lock, [this]() { + return tasks.empty() && active_tasks == 0; + }); +} + +//------------------------------------------------------------- +// Helper functions: Print divider and usage instructions +//------------------------------------------------------------- +void printDivider() { + cout << string(60, '-') << endl; +} + +void printUsage(const char* programName) { + cout << "Usage: " << programName << " [numThreads] [tasksPerThread] [cpuIterations] [ioDataSize] [ioWrites] [ioMode] [onCpuMode]" << endl; + cout << " numThreads: Number of worker threads (default: 4)" << endl; + cout << " tasksPerThread: Number of tasks per thread (default: 50)" << endl; + cout << " cpuIterations: Number of on-CPU computation iterations (default: 100000)" << endl; + cout << " ioDataSize: Number of characters written per synchronous IO operation (default: 5000)" << endl; + cout << " ioWrites: Number of IO operations per task (default: 3000)" << endl; + 
cout << " ioMode: IO mode, options: global, split, async (default: global)" << endl; + cout << " onCpuMode: on-CPU mode, options: inefficient, efficient (default: inefficient)" << endl; +} + +//------------------------------------------------------------- +// GLOBAL mode IO operation: Write to global file with global lock +//------------------------------------------------------------- +void doGlobalIOWork(int taskId, int ioDataSize, int ioWrites) { + stringstream ss; + ss << "Task " << taskId << " data: "; + for (int i = 0; i < ioDataSize; i++) { + ss << "X"; + } + ss << "\n"; + string data = ss.str(); + for (int i = 0; i < ioWrites; i++) { + { + lock_guard lock(globalFileMutex); + if (globalSyncFile.is_open()) { + globalSyncFile << data; + globalSyncFile.flush(); + } + } + doOnCpuWork(1000); + } +} + +//------------------------------------------------------------- +// SPLIT mode IO operation: Each thread writes to its own file +//------------------------------------------------------------- +void doSplitIOWork(int taskId, int ioDataSize, int ioWrites) { + stringstream ss; + ss << "Task " << taskId << " data: "; + for (int i = 0; i < ioDataSize; i++) { + ss << "X"; + } + ss << "\n"; + string data = ss.str(); + static thread_local ofstream localFile; + static thread_local bool initialized = false; + if (!initialized) { + auto tid = this_thread::get_id(); + hash hasher; + size_t id_hash = hasher(tid); + string filename = "split_output_" + to_string(id_hash) + ".txt"; + localFile.open(filename, ios::out | ios::trunc); + if (!localFile.is_open()) { + cerr << "Failed to open file: " << filename << endl; + } + initialized = true; + } + for (int i = 0; i < ioWrites; i++) { + localFile << data; + localFile.flush(); + doOnCpuWork(1000); + } +} + +//------------------------------------------------------------- +// ASYNC mode IO operation: Push data into asynchronous queue +//------------------------------------------------------------- +void doAsyncIOWork(int taskId, int 
ioDataSize, int ioWrites) { + stringstream ss; + ss << "Task " << taskId << " data: "; + for (int i = 0; i < ioDataSize; i++) { + ss << "X"; + } + ss << "\n"; + string data = ss.str(); + for (int i = 0; i < ioWrites; i++) { + if (asyncIO) { + asyncIO->push(data); + } + doOnCpuWork(1000); + } +} + +//------------------------------------------------------------- +// Task processing: on-CPU computation -> IO operation -> small amount of on-CPU computation +//------------------------------------------------------------- +void processTask(int taskId, int cpuIterations, int ioDataSize, int ioWrites) { + // Phase 1: on-CPU computation (choose implementation based on onCpuMode) + doOnCpuWork(cpuIterations); + + // Phase 2: IO operation, choose execution method based on current IO mode + if (currentIOMode == IOMode::GLOBAL) { + doGlobalIOWork(taskId, ioDataSize, ioWrites); + } else if (currentIOMode == IOMode::SPLIT) { + doSplitIOWork(taskId, ioDataSize, ioWrites); + } else if (currentIOMode == IOMode::ASYNC) { + doAsyncIOWork(taskId, ioDataSize, ioWrites); + } + + // Phase 3: Small amount of additional on-CPU computation + doOnCpuWork(cpuIterations / 10); +} + +//------------------------------------------------------------- +// main function: Parse arguments, initialize IO & on-CPU modes, start thread pool, and measure elapsed time +//------------------------------------------------------------- +int main(int argc, char* argv[]) { + // Default parameters + int numThreads = 4; + int tasksPerThread = 50; + int cpuIterations = 100000; + int ioDataSize = 5000; + int ioWrites = 3000; + string ioModeStr = "global"; // Default IO mode + string onCpuModeStr = "inefficient"; // Default on-CPU mode + + // Argument check and help information + if (argc > 1) { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) { + printUsage(argv[0]); + return 0; + } + } + if (argc > 1) { numThreads = atoi(argv[1]); } + if (argc > 2) { tasksPerThread = atoi(argv[2]); } + if (argc > 3) 
{ cpuIterations = atoi(argv[3]); } + if (argc > 4) { ioDataSize = atoi(argv[4]); } + if (argc > 5) { ioWrites = atoi(argv[5]); } + if (argc > 6) { ioModeStr = argv[6]; } + if (argc > 7) { onCpuModeStr = argv[7]; } + + // Determine current IO mode based on ioMode parameter + if (ioModeStr == "global") { + currentIOMode = IOMode::GLOBAL; + cout << "Using GLOBAL mode: Writing to global file with global mutex protection" << endl; + } else if (ioModeStr == "split") { + currentIOMode = IOMode::SPLIT; + cout << "Using SPLIT mode: Each thread writes to its own file, reducing lock granularity" << endl; + } else if (ioModeStr == "async") { + currentIOMode = IOMode::ASYNC; + cout << "Using ASYNC mode: Asynchronous IO, background thread performs batch writes" << endl; + } else { + cout << "Unknown IO mode, defaulting to GLOBAL mode" << endl; + currentIOMode = IOMode::GLOBAL; + } + + // Determine on-CPU mode based on onCpuMode parameter + if (onCpuModeStr == "efficient") { + efficientOnCpu = true; + cout << "Using efficient on-CPU implementation" << endl; + } else { + efficientOnCpu = false; + cout << "Using inefficient on-CPU implementation (default)" << endl; + } + + int totalTasks = numThreads * tasksPerThread; + printDivider(); + cout << "Program configuration:" << endl; + cout << " Number of worker threads (numThreads): " << numThreads << endl; + cout << " Number of tasks per thread (tasksPerThread): " << tasksPerThread << endl; + cout << " Total number of tasks: " << totalTasks << endl; + cout << " On-CPU computation iterations (cpuIterations): " << cpuIterations << endl; + cout << " Characters written per IO operation (ioDataSize): " << ioDataSize << endl; + cout << " Number of IO operations per task (ioWrites): " << ioWrites << endl; + cout << " IO mode (ioMode): " << ioModeStr << endl; + cout << " on-CPU mode (onCpuMode): " << onCpuModeStr << endl; + printDivider(); + + // Perform necessary initialization based on IO mode + if (currentIOMode == IOMode::GLOBAL) { + 
globalSyncFile.open("global_output.txt", ios::out | ios::trunc); + if (!globalSyncFile.is_open()){ + cerr << "Failed to open global_output.txt file. Please check permissions or path." << endl; + return 1; + } + } else if (currentIOMode == IOMode::ASYNC) { + asyncIO = new AsyncIOManager("async_output.txt", 50); + } + + // Create thread pool, distribute tasks, and measure total elapsed time + ThreadPool pool(numThreads); + auto startTime = high_resolution_clock::now(); + for (int i = 0; i < totalTasks; i++) { + pool.enqueue([=]() { + processTask(i, cpuIterations, ioDataSize, ioWrites); + }); + } + pool.wait(); + auto endTime = high_resolution_clock::now(); + duration elapsed = endTime - startTime; + + // Cleanup resources + if (currentIOMode == IOMode::GLOBAL) { + globalSyncFile.close(); + } else if (currentIOMode == IOMode::ASYNC) { + delete asyncIO; + asyncIO = nullptr; + } + + printDivider(); + cout << "Completed " << totalTasks << " tasks in " + << fixed << setprecision(2) << elapsed.count() << " seconds." << endl; + cout << "Current IO mode: " << ioModeStr << ", on-CPU mode: " << onCpuModeStr << endl; + cout << "Optimization direction: Reducing lock granularity/scattered writes or adopting batch asynchronous IO can effectively alleviate off-CPU bottlenecks;" << endl; + cout << " Even with an efficient on-CPU implementation, there will be no significant impact on overall runtime." 
<< endl; + printDivider(); + + return 0; +} \ No newline at end of file diff --git a/example/pmu_hotspot.cpp b/example/pmu_hotspot.cpp index cdfeee3..dadf584 100644 --- a/example/pmu_hotspot.cpp +++ b/example/pmu_hotspot.cpp @@ -217,7 +217,11 @@ void BlockedSample(int pid, double interval, int count, bool blockedSample) std::cout << std::string(50, '=') << std::endl; std::cout << std::setw(40) << "@symbol" << std::setw(40) << "@module"; std::cout << std::setw(40) << std::right << "@percent" << std::endl; - for (int i = 0; i < hotSpotData.size(); ++i) { + int stackLen = hotSpotData.size(); + if (stackLen > 10) { + stackLen = 10; // Only print top 10 hotspots stack. + } + for (int i = 0; i < stackLen; ++i) { PrintStack(hotSpotData[i].stack, 0, hotSpotData[i].period); } g_totalPeriod = 0; diff --git a/example/pmu_hotspot.go b/example/pmu_hotspot.go index ef3c1bd..8fc5d59 100644 --- a/example/pmu_hotspot.go +++ b/example/pmu_hotspot.go @@ -215,8 +215,12 @@ func blockSample(pid int, interval float64, count int, blockedSample int) { printHotSpotGraph(hotspotData) fmt.Printf(strings.Repeat("=", 50) + "Print the call stack of the hotspot function" + strings.Repeat("=", 50) + "\n") fmt.Printf("% -40s%-40s%+40s\n", "@symbol", "@module", "@percent") - for _, data := range hotspotData { - printStack(data.Symbols, data.Period) + stackLen := len(hotspotData) + if stackLen > 10 { + stackLen = 10 + } + for i := 0; i < stackLen; i++ { + printStack(hotspotData[i].Symbols, hotspotData[i].Period) } GlobalPeriod = 0 } diff --git a/example/pmu_hotspot.py b/example/pmu_hotspot.py index eea4b58..61619f4 100644 --- a/example/pmu_hotspot.py +++ b/example/pmu_hotspot.py @@ -163,8 +163,9 @@ def blocked_sample(pid, interval, count, blockedSample): print_hotspot_graph(hotspot_data) print("=" * 50 + "Print the call stack of the hotspot function" + "=" * 50) print(f"{'@symbol':<40}{'@module':<40}{'@percent':>40}") - for data in hotspot_data: - print_stack(data.stack, 0, data.period) + stack_len = 
min(10, len(hotspot_data)) + for i in range(stack_len): + print_stack(hotspot_data[i].stack, 0, hotspot_data[i].period) g_total_period = 0 err = kperf.disable(pd) if err != 0: -- Gitee From 7a8b2487c99d19d607fdd91b2ebd1bbade6b33c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Mon, 28 Apr 2025 19:39:56 +0800 Subject: [PATCH 09/14] add ddrc case --- example/case/ddrc_case.cpp | 31 +++++++++++++++++++++++++++++++ example/pmu_hotspot.cpp | 7 +++++-- example/pmu_hotspot.py | 5 ++++- python/modules/_libkperf/Pmu.py | 6 +++--- 4 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 example/case/ddrc_case.cpp diff --git a/example/case/ddrc_case.cpp b/example/case/ddrc_case.cpp new file mode 100644 index 0000000..7b8c143 --- /dev/null +++ b/example/case/ddrc_case.cpp @@ -0,0 +1,31 @@ +#include +#include +#include +#include + +#define ARRAY_SIZE (1024 * 1024 * 512) // 512MB, ensuring it exceeds L3 cache +#define STRIDE 64 // Memory access stride (simulating cache line access) + +void memory_read_test(std::vector &array) { + volatile int sum = 0; // Prevent compiler optimization + auto start = std::chrono::high_resolution_clock::now(); + + while (true) { // Infinite loop + for (size_t i = 0; i < array.size(); i += STRIDE) { + sum += array[i]; // Memory access operation + } + + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed = end - start; + double readCnt = (array.size() * sizeof(int)) / (elapsed.count() * 1e9); // GB/s + + std::cout << "Data throughput: " << readCnt << " GB/s" << std::endl; + start = end; // Reset timer + } +} + +int main() { + std::vector memory_array(ARRAY_SIZE, 1); // Initialize a large array + memory_read_test(memory_array); + return 0; +} \ No newline at end of file diff --git a/example/pmu_hotspot.cpp b/example/pmu_hotspot.cpp index dadf584..52d9e5d 100644 --- a/example/pmu_hotspot.cpp +++ b/example/pmu_hotspot.cpp @@ -272,6 +272,7 @@ int main(int argc, char** argv) int count = 0; 
bool blockedSample = false; int pid = 0; + bool needKill = false; try { interval = std::stod(argv[1]); if (interval <= 0) { @@ -289,6 +290,7 @@ int main(int argc, char** argv) pid = std::stoi(argv[4]); } catch (const std::invalid_argument&) { StartProc(argv[4], pid); + needKill = true; } } catch (const std::exception& e) { std::cerr << "Error parsing arguments: " << e.what() << "\n"; @@ -296,7 +298,8 @@ int main(int argc, char** argv) return EXIT_FAILURE; } BlockedSample(pid, interval, count, blockedSample); - EndProc(pid); - + if (needKill == true) { + EndProc(pid); + } return 0; } \ No newline at end of file diff --git a/example/pmu_hotspot.py b/example/pmu_hotspot.py index 61619f4..29b075c 100644 --- a/example/pmu_hotspot.py +++ b/example/pmu_hotspot.py @@ -215,17 +215,20 @@ def main(): if blockedSample not in (0, 1): raise ValueError("BlockedSample must be 0 or 1.") + need_kill = False try: pid = int(sys.argv[4]) except ValueError: pid = start_proc(sys.argv[4]) + need_kill = True except ValueError as e: print(f"Invalid argument: {e}") print_usage() sys.exit(1) blocked_sample(pid, interval, count, blockedSample) - end_proc(pid) + if need_kill == True: + end_proc(pid) if __name__ == "__main__": main() \ No newline at end of file diff --git a/python/modules/_libkperf/Pmu.py b/python/modules/_libkperf/Pmu.py index b2236d6..74f2c9d 100644 --- a/python/modules/_libkperf/Pmu.py +++ b/python/modules/_libkperf/Pmu.py @@ -1087,7 +1087,7 @@ class CtypesPmuData(ctypes.Structure): ('cpu', ctypes.c_int), ('cpuTopo', ctypes.POINTER(CtypesCpuTopology)), ('comm', ctypes.c_char_p), - ('period', ctypes.c_int), + ('period', ctypes.c_uint64), ('count', ctypes.c_uint64), ('countPercent', ctypes.c_double), ('ext', ctypes.POINTER(CtypesPmuDataExt)), @@ -1119,7 +1119,7 @@ class CtypesPmuData(ctypes.Structure): self.cpu = ctypes.c_int(cpu) self.cpuTopo = cpuTopo self.comm = ctypes.c_char_p(comm.encode(UTF_8)) - self.period = ctypes.c_int(period) + self.period = ctypes.c_uint64(period) 
self.count = ctypes.c_uint64(count) self.countPercent = ctypes.c_double(countPercent) self.ext = ext @@ -1233,7 +1233,7 @@ class ImplPmuData: @period.setter def period(self, period: int) -> None: - self.c_pmu_data.period = ctypes.c_int(period) + self.c_pmu_data.period = ctypes.c_uint64(period) @property def count(self) -> int: -- Gitee From 5bdbdde2091a2daf04385e3137df34f9dd2526ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Tue, 29 Apr 2025 09:54:46 +0800 Subject: [PATCH 10/14] fix ddrc bandwidth compute --- example/case/ddrc_case.cpp | 2 +- example/pmu_hotspot.cpp | 2 +- example/pmu_hotspot.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example/case/ddrc_case.cpp b/example/case/ddrc_case.cpp index 7b8c143..9ee4b8f 100644 --- a/example/case/ddrc_case.cpp +++ b/example/case/ddrc_case.cpp @@ -17,7 +17,7 @@ void memory_read_test(std::vector &array) { auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration elapsed = end - start; - double readCnt = (array.size() * sizeof(int)) / (elapsed.count() * 1e9); // GB/s + double readCnt = (array.size() * sizeof(int)) / (elapsed.count() * 1024 * 1024 * 1024); // GB/s std::cout << "Data throughput: " << readCnt << " GB/s" << std::endl; start = end; // Reset timer diff --git a/example/pmu_hotspot.cpp b/example/pmu_hotspot.cpp index 52d9e5d..c3b6626 100644 --- a/example/pmu_hotspot.cpp +++ b/example/pmu_hotspot.cpp @@ -298,7 +298,7 @@ int main(int argc, char** argv) return EXIT_FAILURE; } BlockedSample(pid, interval, count, blockedSample); - if (needKill == true) { + if (needKill) { EndProc(pid); } return 0; diff --git a/example/pmu_hotspot.py b/example/pmu_hotspot.py index 29b075c..a1ca675 100644 --- a/example/pmu_hotspot.py +++ b/example/pmu_hotspot.py @@ -227,7 +227,7 @@ def main(): print_usage() sys.exit(1) blocked_sample(pid, interval, count, blockedSample) - if need_kill == True: + if need_kill: end_proc(pid) if __name__ == "__main__": -- Gitee From 
9333f006ef68491b34efda3477a2159e2ca44af0 Mon Sep 17 00:00:00 2001 From: echodo <2220386943@qq.com> Date: Wed, 14 May 2025 11:44:54 +0800 Subject: [PATCH 11/14] support python3.6 && python whl format --- build.sh | 8 ++++++-- python/CMakeLists.txt | 4 ++-- python/modules/CMakeLists.txt | 7 ++++++- python/modules/setup.py.in | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/build.sh b/build.sh index 77bf34f..0e0ef3b 100644 --- a/build.sh +++ b/build.sh @@ -35,6 +35,7 @@ creat_dir "${BUILD_DIR}" export CC=gcc export CXX=g++ PYTHON_EXE="" +PYTHON_WHL=false if [ -d "${THIRD_PARTY}/local" ];then echo ${THIRD_PARTY}/local "is exist" else @@ -56,6 +57,9 @@ for arg in "$@"; do build_type=*) BUILD_TYPE="${arg#*=}" ;; + whl=*) + WHL="${arg#*=}" + ;; python_exe=*) PYTHON_EXE="${arg#*=}" ;; @@ -100,9 +104,9 @@ build_libkperf() cd $BUILD_DIR # Remove the PYTHON_KPERF warning if [ -z ${PYTHON_EXE} ];then - cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} .. + cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DPYTHON_WHL=${WHL} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} .. else - cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DPYTHON_KPERF=${PYTHON_EXE} .. + cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DPYTHON_WHL=${WHL} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DPYTHON_KPERF=${PYTHON_EXE} .. 
fi make -j ${cpu_core_num} make install diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 12a7af4..b35bb89 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -15,8 +15,8 @@ project(python_libkperf) if(DEFINED PYTHON_KPERF AND NOT PYTHON_KPERF STREQUAL "") set(PYTHON_EXECUTABLE ${PYTHON_KPERF}) else() - find_package(PythonInterp 3.7 REQUIRED) - find_package(PythonLibs 3.7 REQUIRED) + find_package(PythonInterp 3.6 REQUIRED) + find_package(PythonLibs 3.6 REQUIRED) endif() message("PYTHON_EXECUTABLE is ${PYTHON_EXECUTABLE}") add_subdirectory(modules) \ No newline at end of file diff --git a/python/modules/CMakeLists.txt b/python/modules/CMakeLists.txt index 40b53a1..43c0f9d 100644 --- a/python/modules/CMakeLists.txt +++ b/python/modules/CMakeLists.txt @@ -20,9 +20,14 @@ configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_SOURCE_DIR}/setup.py ) +if(DEFINED PYTHON_WHL AND PYTHON_WHL) + set(BIN_TYPE "bdist_wheel") +else() + set(BIN_TYPE "install") +endif() add_custom_target(${PROJECT_NAME} ALL - COMMAND ${PYTHON_EXECUTABLE} setup.py install + COMMAND ${PYTHON_EXECUTABLE} setup.py ${BIN_TYPE} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) diff --git a/python/modules/setup.py.in b/python/modules/setup.py.in index 89204b9..5e10f04 100644 --- a/python/modules/setup.py.in +++ b/python/modules/setup.py.in @@ -21,6 +21,6 @@ setup( name='libkperf', version='1.0', packages=find_packages(), - data_files=[('_libkperf', [libkperf_path, libsym_path])] + data_files=[('/_libkperf', [libkperf_path, libsym_path])] ) -- Gitee From 1bb4b168a945f6f699ad9bf607e06ccdd8302541 Mon Sep 17 00:00:00 2001 From: wangtingwang Date: Thu, 15 May 2025 10:23:15 +0800 Subject: [PATCH 12/14] remove cmake PYTHON_WHL warning --- build.sh | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/build.sh b/build.sh index 0e0ef3b..b4bb419 100644 --- a/build.sh +++ b/build.sh @@ -102,12 +102,22 @@ function build_elfin() { 
build_libkperf() { cd $BUILD_DIR - # Remove the PYTHON_KPERF warning - if [ -z ${PYTHON_EXE} ];then - cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DPYTHON_WHL=${WHL} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} .. - else - cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DPYTHON_WHL=${WHL} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DPYTHON_KPERF=${PYTHON_EXE} .. + # Remove the PYTHON_KPERF && PYTHON_WHL warning + CMAKE_ARGS=() + CMAKE_ARGS+=( + "-DINCLUDE_TEST=${INCLUDE_TEST}" + "-DPYTHON=${PYTHON}" + "-DGO=${GO}" + "-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" + "-DCMAKE_BUILD_TYPE=${BUILD_TYPE}" + ) + if [ !-z ${PYTHON_EXE} ];then + CMAKE_ARGS+=("-DPYTHON_KPERF=${PYTHON_EXE}") fi + if [ ${PYTHON} ];then + CMAKE_ARGS+=("-DPYTHON_WHL=${WHL}") + fi + cmake "${CMAKE_ARGS[@]}" .. make -j ${cpu_core_num} make install echo "build libkperf success" -- Gitee From 26aa5f2070622e57251594d6e4c53f325495fa84 Mon Sep 17 00:00:00 2001 From: twwang <920347125@qq.com> Date: Thu, 15 May 2025 12:22:34 +0800 Subject: [PATCH 13/14] fix build.sh PYTHON_WHL & PYTHON_EXE condition bug --- build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index b4bb419..6fb1300 100644 --- a/build.sh +++ b/build.sh @@ -111,10 +111,10 @@ build_libkperf() "-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" "-DCMAKE_BUILD_TYPE=${BUILD_TYPE}" ) - if [ !-z ${PYTHON_EXE} ];then + if [ ! -z ${PYTHON_EXE} ];then CMAKE_ARGS+=("-DPYTHON_KPERF=${PYTHON_EXE}") fi - if [ ${PYTHON} ];then + if [ "${PYTHON}" = "true" ];then CMAKE_ARGS+=("-DPYTHON_WHL=${WHL}") fi cmake "${CMAKE_ARGS[@]}" .. 
-- Gitee From 7916a380af55532961387ab40efa364718fb3ed9 Mon Sep 17 00:00:00 2001 From: twwang <920347125@qq.com> Date: Thu, 15 May 2025 15:56:38 +0800 Subject: [PATCH 14/14] fix setup.py.in _libkperf path bug --- python/modules/CMakeLists.txt | 11 +++++++---- python/modules/setup.py.in | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/modules/CMakeLists.txt b/python/modules/CMakeLists.txt index 43c0f9d..7b8bafc 100644 --- a/python/modules/CMakeLists.txt +++ b/python/modules/CMakeLists.txt @@ -16,16 +16,19 @@ project(python_libkperf) set(LIBKPERF_PATH ${CMAKE_BINARY_DIR}/pmu/libkperf.so) set(LIBSYM_PATH ${CMAKE_BINARY_DIR}/symbol/libsym.so) -configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in - ${CMAKE_CURRENT_SOURCE_DIR}/setup.py -) if(DEFINED PYTHON_WHL AND PYTHON_WHL) set(BIN_TYPE "bdist_wheel") + set(SETUP_LIBKPERF_PATH "/_libkperf") else() set(BIN_TYPE "install") + set(SETUP_LIBKPERF_PATH "_libkperf") endif() +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_SOURCE_DIR}/setup.py +) + add_custom_target(${PROJECT_NAME} ALL COMMAND ${PYTHON_EXECUTABLE} setup.py ${BIN_TYPE} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/python/modules/setup.py.in b/python/modules/setup.py.in index 5e10f04..7e9af74 100644 --- a/python/modules/setup.py.in +++ b/python/modules/setup.py.in @@ -16,11 +16,12 @@ from setuptools import setup, find_packages libkperf_path = '@LIBKPERF_PATH@' libsym_path = '@LIBSYM_PATH@' +_libkperf_path = '@SETUP_LIBKPERF_PATH@' setup( name='libkperf', version='1.0', packages=find_packages(), - data_files=[('/_libkperf', [libkperf_path, libsym_path])] + data_files=[(_libkperf_path, [libkperf_path, libsym_path])] ) -- Gitee