From 8f0c72cc3e0088cdeb7017932562437d02e5dded Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Wed, 23 Apr 2025 17:51:29 +0800 Subject: [PATCH 01/12] fix for not support cpu info --- include/pcerrc.h | 1 + pmu/pmu_metric.cpp | 10 +++++++++- python/modules/kperf/perror.py | 1 + test/test_perf/test_metric.cpp | 1 + 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/pcerrc.h b/include/pcerrc.h index 55a2f9e..767a48b 100644 --- a/include/pcerrc.h +++ b/include/pcerrc.h @@ -106,6 +106,7 @@ extern "C" { #define LIBPERF_ERR_CPUFREQ_NOT_CONFIG 1062 #define LIBPERF_ERR_CLUSTER_ID_OVERSIZE 1063 #define LIBPERF_ERR_INVALID_PMU_BDF_TYPE 1064 +#define LIBPERF_ERR_NOT_SUPPORT_METRIC 1065 #define UNKNOWN_ERROR 9999 diff --git a/pmu/pmu_metric.cpp b/pmu/pmu_metric.cpp index 6dffb3b..a2163a1 100644 --- a/pmu/pmu_metric.cpp +++ b/pmu/pmu_metric.cpp @@ -298,7 +298,11 @@ namespace KUNPENG_PMU { static const map GetDeviceMtricConfig() { - return UNCORE_METRIC_CONFIG_MAP.at(GetCpuType()); + CHIP_TYPE chipType = GetCpuType(); + if (UNCORE_METRIC_CONFIG_MAP.find(chipType) == UNCORE_METRIC_CONFIG_MAP.end()) { + return {}; + } + return UNCORE_METRIC_CONFIG_MAP.at(chipType); } static int QueryUncoreRawDevices() @@ -757,6 +761,10 @@ namespace KUNPENG_PMU { static int CheckDeviceMetricEnum(PmuDeviceMetric metric) { const auto& metricConfig = GetDeviceMtricConfig(); + if (metricConfig.empty()) { + New(LIBPERF_ERR_NOT_SUPPORT_METRIC, "The current platform cpu does not support uncore metric collection."); + return LIBPERF_ERR_NOT_SUPPORT_METRIC; + } if (metricConfig.find(metric) == metricConfig.end()) { New(LIBPERF_ERR_INVALID_PMU_DEVICES_METRIC, "For this platform this metric " + GetMetricString(metric) + " is invalid value for PmuDeviceMetric!"); diff --git a/python/modules/kperf/perror.py b/python/modules/kperf/perror.py index 5183196..bc8c8e2 100644 --- a/python/modules/kperf/perror.py +++ b/python/modules/kperf/perror.py @@ -105,6 +105,7 @@ class Error: 
LIBPERF_ERR_CPUFREQ_NOT_CONFIG = 1062 LIBPERF_ERR_CLUSTER_ID_OVERSIZE = 1063 LIBPERF_ERR_INVALID_PMU_BDF_TYPE = 1064 + LIBPERF_ERR_NOT_SUPPORT_METRIC = 1065 UNKNOWN_ERROR = 9999 diff --git a/test/test_perf/test_metric.cpp b/test/test_perf/test_metric.cpp index 85b8653..56bee10 100644 --- a/test/test_perf/test_metric.cpp +++ b/test/test_perf/test_metric.cpp @@ -107,6 +107,7 @@ TEST_F(TestMetric, CollectDDRBandwidth) PmuDeviceAttr devAttr = {}; devAttr.metric = PMU_DDR_READ_BW; int pd = PmuDeviceOpen(&devAttr, 1); + cout << Perror() << endl; ASSERT_NE(pd, -1); PmuEnable(pd); sleep(1); -- Gitee From 346d0ed35a411ebfd668ce84136e512ad94d86c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Mon, 28 Apr 2025 15:53:10 +0800 Subject: [PATCH 02/12] add blocked_sample case process, opt stack only print top 10 --- example/case/blocked_sample_case.cpp | 465 +++++++++++++++++++++++++++ example/pmu_hotspot.cpp | 6 +- example/pmu_hotspot.go | 8 +- example/pmu_hotspot.py | 5 +- 4 files changed, 479 insertions(+), 5 deletions(-) create mode 100644 example/case/blocked_sample_case.cpp diff --git a/example/case/blocked_sample_case.cpp b/example/case/blocked_sample_case.cpp new file mode 100644 index 0000000..4db89e7 --- /dev/null +++ b/example/case/blocked_sample_case.cpp @@ -0,0 +1,465 @@ +/* + * Optimization Notes: + * + * This program constructs a multi-threaded task, where each task consists of three phases: + * + * 1. on-CPU computation: + * Two modes are provided: + * - inefficient: Simulates inefficient computation using heavy floating-point operations (default). + * - efficient: Uses integers instead of floating-point numbers for optimized computation + * (though more efficient, overall time remains almost unchanged as off-CPU phase (synchronous IO) is the bottleneck). + * + * 2. IO operation phase: + * Three modes are provided: + * - global: Write to a single file protected by a global lock (baseline). 
+ * - split: Each thread writes to its own file (reduces lock contention). + * - async: Asynchronous IO, enqueues data for background batch writing (previous version lacked batching, causing worse performance). + * + * 3. Supplemental on-CPU computation. + * + * Usage (command-line argument order): + * [numThreads] [tasksPerThread] [cpuIterations] [ioDataSize] [ioWrites] [ioMode] [onCpuMode] + * + * Example (your given test parameters, plus onCpuMode parameter): + * ./blocked_sample_io 4 50 100000 5000 3000 global inefficient + * + * Where: + * ioMode: global|split|async + * onCpuMode: inefficient (inefficient implementation) or efficient (optimized implementation) + * + * Note: If the user attempts to optimize the CPU computation part using the efficient on-CPU mode, + * the overall runtime remains almost unchanged, proving that the bottleneck lies mainly in the off-CPU part (synchronous IO and lock contention). + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std; +using namespace std::chrono; + +// Define IO mode enumeration +enum class IOMode { GLOBAL, SPLIT, ASYNC }; +IOMode currentIOMode = IOMode::GLOBAL; // Default IO mode + +//------------------------------------------------------------- +// on-CPU simulation: Implementation of two computation methods +//------------------------------------------------------------- +// Inefficient CPU work: Heavy loop computation to prevent compiler optimization +void doOnCpuWorkInefficient(int iterations) { + volatile double dummy = 1.0; + for (int i = 0; i < iterations; i++) { + dummy = dummy * 1.000001 + 0.000001; + } + (void)dummy; +} + +// Efficient CPU work: Use integers to simulate decimals for optimized computation +void doOnCpuWorkEfficient(int iterations) { + long long dummy = 1000000; // Use integers to simulate decimals, assuming precision of 1e-6 + for (int i = 0; i < iterations; 
i++) { + dummy = dummy * 1000001 / 1000000 + 1; + } + (void)dummy; +} + +// Global flag to decide which on-CPU computation method to use (default is inefficient) +bool efficientOnCpu = false; + +// Encapsulated on-CPU work interface, calls corresponding implementation based on efficientOnCpu +void doOnCpuWork(int iterations) { + if (efficientOnCpu) { + doOnCpuWorkEfficient(iterations); + } else { + doOnCpuWorkInefficient(iterations); + } +} + +//------------------------------------------------------------- +// GLOBAL mode: Global file and mutex +//------------------------------------------------------------- +mutex globalFileMutex; +ofstream globalSyncFile; // Global file + +//------------------------------------------------------------- +// Asynchronous IO Manager (optimized): Batch writing to reduce flush frequency +//------------------------------------------------------------- +class AsyncIOManager { +private: + queue msgQueue; + mutex mtx; + condition_variable cv; + atomic stop; + thread worker; + ofstream outFile; + const size_t batchSize; // Number of messages written per batch + +public: + AsyncIOManager(const string& filename, size_t batchSize = 50) + : stop(false), batchSize(batchSize) + { + outFile.open(filename, ios::out | ios::trunc); + if (!outFile.is_open()){ + cerr << "Failed to open file: " << filename << endl; + } + worker = thread([this]() { this->process(); }); + } + + ~AsyncIOManager(){ + { + lock_guard lock(mtx); + stop = true; + } + cv.notify_one(); + if(worker.joinable()){ + worker.join(); + } + if(outFile.is_open()){ + outFile.close(); + } + } + + // Push message to be written into the queue + void push(const string &msg) { + { + lock_guard lock(mtx); + msgQueue.push(msg); + } + cv.notify_one(); + } + +private: + // Background thread processes batch writes + void process() { + while (true) { + vector localBatch; + { + unique_lock lock(mtx); + cv.wait(lock, [this]() { return stop || !msgQueue.empty(); }); + while (!msgQueue.empty() && 
localBatch.size() < batchSize) { + localBatch.push_back(msgQueue.front()); + msgQueue.pop(); + } + if (stop && localBatch.empty()) { + break; + } + } + // Merge and write batch, then flush + if (outFile.is_open()) { + string batchStr; + for (const auto &msg : localBatch) { + batchStr.append(msg); + } + outFile << batchStr; + outFile.flush(); + } + } + } +}; + +AsyncIOManager *asyncIO = nullptr; // Global pointer to asynchronous IO manager + +//------------------------------------------------------------- +// Thread Pool: Manages worker threads and task queue +//------------------------------------------------------------- +class ThreadPool { +public: + ThreadPool(size_t threads); + ~ThreadPool(); + void enqueue(function task); + void wait(); + +private: + vector workers; + queue> tasks; + mutex queue_mutex; + condition_variable condition; + atomic stop; + atomic active_tasks; + condition_variable cv_finished; +}; + +ThreadPool::ThreadPool(size_t threads) : stop(false), active_tasks(0) { + for (size_t i = 0; i < threads; i++) { + workers.emplace_back([this, i]() { + while (true) { + function task; + { + unique_lock lock(this->queue_mutex); + this->condition.wait(lock, [this]() { + return this->stop.load() || !this->tasks.empty(); + }); + if (this->stop.load() && this->tasks.empty()) + return; + task = move(this->tasks.front()); + this->tasks.pop(); + active_tasks++; + } + task(); + { + lock_guard lock(this->queue_mutex); + active_tasks--; + if (tasks.empty() && active_tasks == 0) { + cv_finished.notify_all(); + } + } + } + }); + } +} + +ThreadPool::~ThreadPool() { + { + lock_guard lock(queue_mutex); + stop.store(true); + } + condition.notify_all(); + for (thread &worker : workers) { + if (worker.joinable()) + worker.join(); + } +} + +void ThreadPool::enqueue(function task) { + { + lock_guard lock(queue_mutex); + tasks.push(move(task)); + } + condition.notify_one(); +} + +void ThreadPool::wait() { + unique_lock lock(queue_mutex); + cv_finished.wait(lock, [this]() { + 
return tasks.empty() && active_tasks == 0; + }); +} + +//------------------------------------------------------------- +// Helper functions: Print divider and usage instructions +//------------------------------------------------------------- +void printDivider() { + cout << string(60, '-') << endl; +} + +void printUsage(const char* programName) { + cout << "Usage: " << programName << " [numThreads] [tasksPerThread] [cpuIterations] [ioDataSize] [ioWrites] [ioMode] [onCpuMode]" << endl; + cout << " numThreads: Number of worker threads (default: 4)" << endl; + cout << " tasksPerThread: Number of tasks per thread (default: 50)" << endl; + cout << " cpuIterations: Number of on-CPU computation iterations (default: 100000)" << endl; + cout << " ioDataSize: Number of characters written per synchronous IO operation (default: 5000)" << endl; + cout << " ioWrites: Number of IO operations per task (default: 3000)" << endl; + cout << " ioMode: IO mode, options: global, split, async (default: global)" << endl; + cout << " onCpuMode: on-CPU mode, options: inefficient, efficient (default: inefficient)" << endl; +} + +//------------------------------------------------------------- +// GLOBAL mode IO operation: Write to global file with global lock +//------------------------------------------------------------- +void doGlobalIOWork(int taskId, int ioDataSize, int ioWrites) { + stringstream ss; + ss << "Task " << taskId << " data: "; + for (int i = 0; i < ioDataSize; i++) { + ss << "X"; + } + ss << "\n"; + string data = ss.str(); + for (int i = 0; i < ioWrites; i++) { + { + lock_guard lock(globalFileMutex); + if (globalSyncFile.is_open()) { + globalSyncFile << data; + globalSyncFile.flush(); + } + } + doOnCpuWork(1000); + } +} + +//------------------------------------------------------------- +// SPLIT mode IO operation: Each thread writes to its own file +//------------------------------------------------------------- +void doSplitIOWork(int taskId, int ioDataSize, int ioWrites) { 
+ stringstream ss; + ss << "Task " << taskId << " data: "; + for (int i = 0; i < ioDataSize; i++) { + ss << "X"; + } + ss << "\n"; + string data = ss.str(); + static thread_local ofstream localFile; + static thread_local bool initialized = false; + if (!initialized) { + auto tid = this_thread::get_id(); + hash hasher; + size_t id_hash = hasher(tid); + string filename = "split_output_" + to_string(id_hash) + ".txt"; + localFile.open(filename, ios::out | ios::trunc); + if (!localFile.is_open()) { + cerr << "Failed to open file: " << filename << endl; + } + initialized = true; + } + for (int i = 0; i < ioWrites; i++) { + localFile << data; + localFile.flush(); + doOnCpuWork(1000); + } +} + +//------------------------------------------------------------- +// ASYNC mode IO operation: Push data into asynchronous queue +//------------------------------------------------------------- +void doAsyncIOWork(int taskId, int ioDataSize, int ioWrites) { + stringstream ss; + ss << "Task " << taskId << " data: "; + for (int i = 0; i < ioDataSize; i++) { + ss << "X"; + } + ss << "\n"; + string data = ss.str(); + for (int i = 0; i < ioWrites; i++) { + if (asyncIO) { + asyncIO->push(data); + } + doOnCpuWork(1000); + } +} + +//------------------------------------------------------------- +// Task processing: on-CPU computation -> IO operation -> small amount of on-CPU computation +//------------------------------------------------------------- +void processTask(int taskId, int cpuIterations, int ioDataSize, int ioWrites) { + // Phase 1: on-CPU computation (choose implementation based on onCpuMode) + doOnCpuWork(cpuIterations); + + // Phase 2: IO operation, choose execution method based on current IO mode + if (currentIOMode == IOMode::GLOBAL) { + doGlobalIOWork(taskId, ioDataSize, ioWrites); + } else if (currentIOMode == IOMode::SPLIT) { + doSplitIOWork(taskId, ioDataSize, ioWrites); + } else if (currentIOMode == IOMode::ASYNC) { + doAsyncIOWork(taskId, ioDataSize, ioWrites); + } + + 
// Phase 3: Small amount of additional on-CPU computation + doOnCpuWork(cpuIterations / 10); +} + +//------------------------------------------------------------- +// main function: Parse arguments, initialize IO & on-CPU modes, start thread pool, and measure elapsed time +//------------------------------------------------------------- +int main(int argc, char* argv[]) { + // Default parameters + int numThreads = 4; + int tasksPerThread = 50; + int cpuIterations = 100000; + int ioDataSize = 5000; + int ioWrites = 3000; + string ioModeStr = "global"; // Default IO mode + string onCpuModeStr = "inefficient"; // Default on-CPU mode + + // Argument check and help information + if (argc > 1) { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) { + printUsage(argv[0]); + return 0; + } + } + if (argc > 1) { numThreads = atoi(argv[1]); } + if (argc > 2) { tasksPerThread = atoi(argv[2]); } + if (argc > 3) { cpuIterations = atoi(argv[3]); } + if (argc > 4) { ioDataSize = atoi(argv[4]); } + if (argc > 5) { ioWrites = atoi(argv[5]); } + if (argc > 6) { ioModeStr = argv[6]; } + if (argc > 7) { onCpuModeStr = argv[7]; } + + // Determine current IO mode based on ioMode parameter + if (ioModeStr == "global") { + currentIOMode = IOMode::GLOBAL; + cout << "Using GLOBAL mode: Writing to global file with global mutex protection" << endl; + } else if (ioModeStr == "split") { + currentIOMode = IOMode::SPLIT; + cout << "Using SPLIT mode: Each thread writes to its own file, reducing lock granularity" << endl; + } else if (ioModeStr == "async") { + currentIOMode = IOMode::ASYNC; + cout << "Using ASYNC mode: Asynchronous IO, background thread performs batch writes" << endl; + } else { + cout << "Unknown IO mode, defaulting to GLOBAL mode" << endl; + currentIOMode = IOMode::GLOBAL; + } + + // Determine on-CPU mode based on onCpuMode parameter + if (onCpuModeStr == "efficient") { + efficientOnCpu = true; + cout << "Using efficient on-CPU implementation" << endl; + } else { + 
efficientOnCpu = false; + cout << "Using inefficient on-CPU implementation (default)" << endl; + } + + int totalTasks = numThreads * tasksPerThread; + printDivider(); + cout << "Program configuration:" << endl; + cout << " Number of worker threads (numThreads): " << numThreads << endl; + cout << " Number of tasks per thread (tasksPerThread): " << tasksPerThread << endl; + cout << " Total number of tasks: " << totalTasks << endl; + cout << " On-CPU computation iterations (cpuIterations): " << cpuIterations << endl; + cout << " Characters written per IO operation (ioDataSize): " << ioDataSize << endl; + cout << " Number of IO operations per task (ioWrites): " << ioWrites << endl; + cout << " IO mode (ioMode): " << ioModeStr << endl; + cout << " on-CPU mode (onCpuMode): " << onCpuModeStr << endl; + printDivider(); + + // Perform necessary initialization based on IO mode + if (currentIOMode == IOMode::GLOBAL) { + globalSyncFile.open("global_output.txt", ios::out | ios::trunc); + if (!globalSyncFile.is_open()){ + cerr << "Failed to open global_output.txt file. Please check permissions or path." << endl; + return 1; + } + } else if (currentIOMode == IOMode::ASYNC) { + asyncIO = new AsyncIOManager("async_output.txt", 50); + } + + // Create thread pool, distribute tasks, and measure total elapsed time + ThreadPool pool(numThreads); + auto startTime = high_resolution_clock::now(); + for (int i = 0; i < totalTasks; i++) { + pool.enqueue([=]() { + processTask(i, cpuIterations, ioDataSize, ioWrites); + }); + } + pool.wait(); + auto endTime = high_resolution_clock::now(); + duration elapsed = endTime - startTime; + + // Cleanup resources + if (currentIOMode == IOMode::GLOBAL) { + globalSyncFile.close(); + } else if (currentIOMode == IOMode::ASYNC) { + delete asyncIO; + asyncIO = nullptr; + } + + printDivider(); + cout << "Completed " << totalTasks << " tasks in " + << fixed << setprecision(2) << elapsed.count() << " seconds." 
<< endl; + cout << "Current IO mode: " << ioModeStr << ", on-CPU mode: " << onCpuModeStr << endl; + cout << "Optimization direction: Reducing lock granularity/scattered writes or adopting batch asynchronous IO can effectively alleviate off-CPU bottlenecks;" << endl; + cout << " Even with an efficient on-CPU implementation, there will be no significant impact on overall runtime." << endl; + printDivider(); + + return 0; +} \ No newline at end of file diff --git a/example/pmu_hotspot.cpp b/example/pmu_hotspot.cpp index cdfeee3..dadf584 100644 --- a/example/pmu_hotspot.cpp +++ b/example/pmu_hotspot.cpp @@ -217,7 +217,11 @@ void BlockedSample(int pid, double interval, int count, bool blockedSample) std::cout << std::string(50, '=') << std::endl; std::cout << std::setw(40) << "@symbol" << std::setw(40) << "@module"; std::cout << std::setw(40) << std::right << "@percent" << std::endl; - for (int i = 0; i < hotSpotData.size(); ++i) { + int stackLen = hotSpotData.size(); + if (stackLen > 10) { + stackLen = 10; // Only print top 10 hotspots stack. 
+ } + for (int i = 0; i < stackLen; ++i) { PrintStack(hotSpotData[i].stack, 0, hotSpotData[i].period); } g_totalPeriod = 0; diff --git a/example/pmu_hotspot.go b/example/pmu_hotspot.go index ef3c1bd..8fc5d59 100644 --- a/example/pmu_hotspot.go +++ b/example/pmu_hotspot.go @@ -215,8 +215,12 @@ func blockSample(pid int, interval float64, count int, blockedSample int) { printHotSpotGraph(hotspotData) fmt.Printf(strings.Repeat("=", 50) + "Print the call stack of the hotspot function" + strings.Repeat("=", 50) + "\n") fmt.Printf("% -40s%-40s%+40s\n", "@symbol", "@module", "@percent") - for _, data := range hotspotData { - printStack(data.Symbols, data.Period) + stackLen := len(hotspotData) + if stackLen > 10 { + stackLen = 10 + } + for i := 0; i < stackLen; i++ { + printStack(hotspotData[i].Symbols, hotspotData[i].Period) } GlobalPeriod = 0 } diff --git a/example/pmu_hotspot.py b/example/pmu_hotspot.py index eea4b58..61619f4 100644 --- a/example/pmu_hotspot.py +++ b/example/pmu_hotspot.py @@ -163,8 +163,9 @@ def blocked_sample(pid, interval, count, blockedSample): print_hotspot_graph(hotspot_data) print("=" * 50 + "Print the call stack of the hotspot function" + "=" * 50) print(f"{'@symbol':<40}{'@module':<40}{'@percent':>40}") - for data in hotspot_data: - print_stack(data.stack, 0, data.period) + stack_len = min(10, len(hotspot_data)) + for i in range(stack_len): + print_stack(hotspot_data[i].stack, 0, hotspot_data[i].period) g_total_period = 0 err = kperf.disable(pd) if err != 0: -- Gitee From da7e0e9de5e935fb0d5dc977d66a107d96c0680e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Mon, 28 Apr 2025 19:39:56 +0800 Subject: [PATCH 03/12] add ddrc case --- example/case/ddrc_case.cpp | 31 +++++++++++++++++++++++++++++++ example/pmu_hotspot.cpp | 7 +++++-- example/pmu_hotspot.py | 5 ++++- python/modules/_libkperf/Pmu.py | 6 +++--- 4 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 example/case/ddrc_case.cpp diff --git 
a/example/case/ddrc_case.cpp b/example/case/ddrc_case.cpp new file mode 100644 index 0000000..7b8c143 --- /dev/null +++ b/example/case/ddrc_case.cpp @@ -0,0 +1,31 @@ +#include +#include +#include +#include + +#define ARRAY_SIZE (1024 * 1024 * 512) // 512MB, ensuring it exceeds L3 cache +#define STRIDE 64 // Memory access stride (simulating cache line access) + +void memory_read_test(std::vector &array) { + volatile int sum = 0; // Prevent compiler optimization + auto start = std::chrono::high_resolution_clock::now(); + + while (true) { // Infinite loop + for (size_t i = 0; i < array.size(); i += STRIDE) { + sum += array[i]; // Memory access operation + } + + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed = end - start; + double readCnt = (array.size() * sizeof(int)) / (elapsed.count() * 1e9); // GB/s + + std::cout << "Data throughput: " << readCnt << " GB/s" << std::endl; + start = end; // Reset timer + } +} + +int main() { + std::vector memory_array(ARRAY_SIZE, 1); // Initialize a large array + memory_read_test(memory_array); + return 0; +} \ No newline at end of file diff --git a/example/pmu_hotspot.cpp b/example/pmu_hotspot.cpp index dadf584..52d9e5d 100644 --- a/example/pmu_hotspot.cpp +++ b/example/pmu_hotspot.cpp @@ -272,6 +272,7 @@ int main(int argc, char** argv) int count = 0; bool blockedSample = false; int pid = 0; + bool needKill = false; try { interval = std::stod(argv[1]); if (interval <= 0) { @@ -289,6 +290,7 @@ int main(int argc, char** argv) pid = std::stoi(argv[4]); } catch (const std::invalid_argument&) { StartProc(argv[4], pid); + needKill = true; } } catch (const std::exception& e) { std::cerr << "Error parsing arguments: " << e.what() << "\n"; @@ -296,7 +298,8 @@ int main(int argc, char** argv) return EXIT_FAILURE; } BlockedSample(pid, interval, count, blockedSample); - EndProc(pid); - + if (needKill == true) { + EndProc(pid); + } return 0; } \ No newline at end of file diff --git a/example/pmu_hotspot.py 
b/example/pmu_hotspot.py index 61619f4..29b075c 100644 --- a/example/pmu_hotspot.py +++ b/example/pmu_hotspot.py @@ -215,17 +215,20 @@ def main(): if blockedSample not in (0, 1): raise ValueError("BlockedSample must be 0 or 1.") + need_kill = False try: pid = int(sys.argv[4]) except ValueError: pid = start_proc(sys.argv[4]) + need_kill = True except ValueError as e: print(f"Invalid argument: {e}") print_usage() sys.exit(1) blocked_sample(pid, interval, count, blockedSample) - end_proc(pid) + if need_kill == True: + end_proc(pid) if __name__ == "__main__": main() \ No newline at end of file diff --git a/python/modules/_libkperf/Pmu.py b/python/modules/_libkperf/Pmu.py index 51512a0..ae0847c 100644 --- a/python/modules/_libkperf/Pmu.py +++ b/python/modules/_libkperf/Pmu.py @@ -1087,7 +1087,7 @@ class CtypesPmuData(ctypes.Structure): ('cpu', ctypes.c_int), ('cpuTopo', ctypes.POINTER(CtypesCpuTopology)), ('comm', ctypes.c_char_p), - ('period', ctypes.c_int), + ('period', ctypes.c_uint64), ('count', ctypes.c_uint64), ('countPercent', ctypes.c_double), ('ext', ctypes.POINTER(CtypesPmuDataExt)), @@ -1119,7 +1119,7 @@ class CtypesPmuData(ctypes.Structure): self.cpu = ctypes.c_int(cpu) self.cpuTopo = cpuTopo self.comm = ctypes.c_char_p(comm.encode(UTF_8)) - self.period = ctypes.c_int(period) + self.period = ctypes.c_uint64(period) self.count = ctypes.c_uint64(count) self.countPercent = ctypes.c_double(countPercent) self.ext = ext @@ -1233,7 +1233,7 @@ class ImplPmuData: @period.setter def period(self, period: int) -> None: - self.c_pmu_data.period = ctypes.c_int(period) + self.c_pmu_data.period = ctypes.c_uint64(period) @property def count(self) -> int: -- Gitee From 6a88f0ab817acec2b6d09f919bfd484bcc04870d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=96=E9=BE=99?= Date: Tue, 29 Apr 2025 09:54:46 +0800 Subject: [PATCH 04/12] fix ddrc bandwidth compute --- example/case/ddrc_case.cpp | 2 +- example/pmu_hotspot.cpp | 2 +- example/pmu_hotspot.py | 2 +- 3 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/example/case/ddrc_case.cpp b/example/case/ddrc_case.cpp index 7b8c143..9ee4b8f 100644 --- a/example/case/ddrc_case.cpp +++ b/example/case/ddrc_case.cpp @@ -17,7 +17,7 @@ void memory_read_test(std::vector &array) { auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration elapsed = end - start; - double readCnt = (array.size() * sizeof(int)) / (elapsed.count() * 1e9); // GB/s + double readCnt = (array.size() * sizeof(int)) / (elapsed.count() * 1024 * 1024 * 1024); // GB/s std::cout << "Data throughput: " << readCnt << " GB/s" << std::endl; start = end; // Reset timer diff --git a/example/pmu_hotspot.cpp b/example/pmu_hotspot.cpp index 52d9e5d..c3b6626 100644 --- a/example/pmu_hotspot.cpp +++ b/example/pmu_hotspot.cpp @@ -298,7 +298,7 @@ int main(int argc, char** argv) return EXIT_FAILURE; } BlockedSample(pid, interval, count, blockedSample); - if (needKill == true) { + if (needKill) { EndProc(pid); } return 0; diff --git a/example/pmu_hotspot.py b/example/pmu_hotspot.py index 29b075c..a1ca675 100644 --- a/example/pmu_hotspot.py +++ b/example/pmu_hotspot.py @@ -227,7 +227,7 @@ def main(): print_usage() sys.exit(1) blocked_sample(pid, interval, count, blockedSample) - if need_kill == True: + if need_kill: end_proc(pid) if __name__ == "__main__": -- Gitee From 3148fa3902324fa9f46b6c98027770c4cc5f3dc1 Mon Sep 17 00:00:00 2001 From: echodo <2220386943@qq.com> Date: Wed, 14 May 2025 11:44:54 +0800 Subject: [PATCH 05/12] support python3.6 && python whl format --- build.sh | 8 ++++++-- python/CMakeLists.txt | 4 ++-- python/modules/CMakeLists.txt | 7 ++++++- python/modules/setup.py.in | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/build.sh b/build.sh index 77bf34f..0e0ef3b 100644 --- a/build.sh +++ b/build.sh @@ -35,6 +35,7 @@ creat_dir "${BUILD_DIR}" export CC=gcc export CXX=g++ PYTHON_EXE="" +PYTHON_WHL=false if [ -d "${THIRD_PARTY}/local" ];then echo ${THIRD_PARTY}/local "is exist" else @@ 
-56,6 +57,9 @@ for arg in "$@"; do build_type=*) BUILD_TYPE="${arg#*=}" ;; + whl=*) + WHL="${arg#*=}" + ;; python_exe=*) PYTHON_EXE="${arg#*=}" ;; @@ -100,9 +104,9 @@ build_libkperf() cd $BUILD_DIR # Remove the PYTHON_KPERF warning if [ -z ${PYTHON_EXE} ];then - cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} .. + cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DPYTHON_WHL=${WHL} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} .. else - cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DPYTHON_KPERF=${PYTHON_EXE} .. + cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DPYTHON_WHL=${WHL} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DPYTHON_KPERF=${PYTHON_EXE} .. fi make -j ${cpu_core_num} make install diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 12a7af4..b35bb89 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -15,8 +15,8 @@ project(python_libkperf) if(DEFINED PYTHON_KPERF AND NOT PYTHON_KPERF STREQUAL "") set(PYTHON_EXECUTABLE ${PYTHON_KPERF}) else() - find_package(PythonInterp 3.7 REQUIRED) - find_package(PythonLibs 3.7 REQUIRED) + find_package(PythonInterp 3.6 REQUIRED) + find_package(PythonLibs 3.6 REQUIRED) endif() message("PYTHON_EXECUTABLE is ${PYTHON_EXECUTABLE}") add_subdirectory(modules) \ No newline at end of file diff --git a/python/modules/CMakeLists.txt b/python/modules/CMakeLists.txt index 40b53a1..43c0f9d 100644 --- a/python/modules/CMakeLists.txt +++ b/python/modules/CMakeLists.txt @@ -20,9 +20,14 @@ configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_SOURCE_DIR}/setup.py ) +if(DEFINED PYTHON_WHL AND PYTHON_WHL) + set(BIN_TYPE "bdist_wheel") +else() + set(BIN_TYPE "install") +endif() add_custom_target(${PROJECT_NAME} ALL - 
COMMAND ${PYTHON_EXECUTABLE} setup.py install + COMMAND ${PYTHON_EXECUTABLE} setup.py ${BIN_TYPE} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) diff --git a/python/modules/setup.py.in b/python/modules/setup.py.in index 89204b9..5e10f04 100644 --- a/python/modules/setup.py.in +++ b/python/modules/setup.py.in @@ -21,6 +21,6 @@ setup( name='libkperf', version='1.0', packages=find_packages(), - data_files=[('_libkperf', [libkperf_path, libsym_path])] + data_files=[('/_libkperf', [libkperf_path, libsym_path])] ) -- Gitee From 6929beb5c3dac8a761d391bf912c897aff3227de Mon Sep 17 00:00:00 2001 From: wangtingwang Date: Thu, 15 May 2025 10:23:15 +0800 Subject: [PATCH 06/12] remove cmake PYTHON_WHL warning --- build.sh | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/build.sh b/build.sh index 0e0ef3b..b4bb419 100644 --- a/build.sh +++ b/build.sh @@ -102,12 +102,22 @@ function build_elfin() { build_libkperf() { cd $BUILD_DIR - # Remove the PYTHON_KPERF warning - if [ -z ${PYTHON_EXE} ];then - cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DPYTHON_WHL=${WHL} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} .. - else - cmake -DINCLUDE_TEST=${INCLUDE_TEST} -DPYTHON=${PYTHON} -DPYTHON_WHL=${WHL} -DGO=${GO} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DPYTHON_KPERF=${PYTHON_EXE} .. + # Remove the PYTHON_KPERF && PYTHON_WHL warning + CMAKE_ARGS=() + CMAKE_ARGS+=( + "-DINCLUDE_TEST=${INCLUDE_TEST}" + "-DPYTHON=${PYTHON}" + "-DGO=${GO}" + "-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" + "-DCMAKE_BUILD_TYPE=${BUILD_TYPE}" + ) + if [ !-z ${PYTHON_EXE} ];then + CMAKE_ARGS+=("-DPYTHON_KPERF=${PYTHON_EXE}") fi + if [ ${PYTHON} ];then + CMAKE_ARGS+=("-DPYTHON_WHL=${WHL}") + fi + cmake "${CMAKE_ARGS[@]}" .. 
make -j ${cpu_core_num} make install echo "build libkperf success" -- Gitee From 65fb9f22932c5769faba519451d8b79b2aba72ea Mon Sep 17 00:00:00 2001 From: twwang <920347125@qq.com> Date: Thu, 15 May 2025 12:22:34 +0800 Subject: [PATCH 07/12] fix build.sh PYTHON_WHL & PYTHON_EXE condition bug --- build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index b4bb419..6fb1300 100644 --- a/build.sh +++ b/build.sh @@ -111,10 +111,10 @@ build_libkperf() "-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" "-DCMAKE_BUILD_TYPE=${BUILD_TYPE}" ) - if [ !-z ${PYTHON_EXE} ];then + if [ ! -z ${PYTHON_EXE} ];then CMAKE_ARGS+=("-DPYTHON_KPERF=${PYTHON_EXE}") fi - if [ ${PYTHON} ];then + if [ "${PYTHON}" = "true" ];then CMAKE_ARGS+=("-DPYTHON_WHL=${WHL}") fi cmake "${CMAKE_ARGS[@]}" .. -- Gitee From 6b6e21f348675aa7da013048bfd5b98f1b26e27b Mon Sep 17 00:00:00 2001 From: twwang <920347125@qq.com> Date: Thu, 15 May 2025 15:56:38 +0800 Subject: [PATCH 08/12] fix setup.py.in _libkperf path bug --- python/modules/CMakeLists.txt | 11 +++++++---- python/modules/setup.py.in | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/modules/CMakeLists.txt b/python/modules/CMakeLists.txt index 43c0f9d..7b8bafc 100644 --- a/python/modules/CMakeLists.txt +++ b/python/modules/CMakeLists.txt @@ -16,16 +16,19 @@ project(python_libkperf) set(LIBKPERF_PATH ${CMAKE_BINARY_DIR}/pmu/libkperf.so) set(LIBSYM_PATH ${CMAKE_BINARY_DIR}/symbol/libsym.so) -configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in - ${CMAKE_CURRENT_SOURCE_DIR}/setup.py -) if(DEFINED PYTHON_WHL AND PYTHON_WHL) set(BIN_TYPE "bdist_wheel") + set(SETUP_LIBKPERF_PATH "/_libkperf") else() set(BIN_TYPE "install") + set(SETUP_LIBKPERF_PATH "_libkperf") endif() +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_SOURCE_DIR}/setup.py +) + add_custom_target(${PROJECT_NAME} ALL COMMAND ${PYTHON_EXECUTABLE} setup.py ${BIN_TYPE} WORKING_DIRECTORY 
${CMAKE_CURRENT_SOURCE_DIR} diff --git a/python/modules/setup.py.in b/python/modules/setup.py.in index 5e10f04..7e9af74 100644 --- a/python/modules/setup.py.in +++ b/python/modules/setup.py.in @@ -16,11 +16,12 @@ from setuptools import setup, find_packages libkperf_path = '@LIBKPERF_PATH@' libsym_path = '@LIBSYM_PATH@' +_libkperf_path = '@SETUP_LIBKPERF_PATH@' setup( name='libkperf', version='1.0', packages=find_packages(), - data_files=[('/_libkperf', [libkperf_path, libsym_path])] + data_files=[(_libkperf_path, [libkperf_path, libsym_path])] ) -- Gitee From 97b4ba51b418c3b549a31644c12aa915ab94f27d Mon Sep 17 00:00:00 2001 From: wuying39 <921169248@qq.com> Date: Thu, 22 May 2025 14:56:05 +0800 Subject: [PATCH 09/12] corrected information and spelling errors --- README.en.md | 2 +- README.md | 2 +- docs/Details_Usage.md | 4 ++-- docs/Python_API.md | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.en.md b/README.en.md index 503138f..cae1f61 100644 --- a/README.en.md +++ b/README.en.md @@ -57,7 +57,7 @@ Minimum required GCC version: Minimum required Python version: -- python-3.7. +- python-3.6. 
To build a library with C API: diff --git a/README.md b/README.md index ee59dc4..e0441bc 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ v1.0: - gcc-4.8.5 和 glibc-2.17 最低依赖python版本: -- python-3.7 +- python-3.6 编译生成动态库和C的API: ```shell diff --git a/docs/Details_Usage.md b/docs/Details_Usage.md index 815f4d8..b52fb45 100644 --- a/docs/Details_Usage.md +++ b/docs/Details_Usage.md @@ -41,7 +41,7 @@ func main() { attr := kperf.PmuAttr{EvtList:[]string{"cycles", "branch-misses"}} pd, err := kperf.PmuOpen(kperf.COUNT, attr) if err != nil { - fmt.Printf("kperf pmuopen couting failed, expect err is nil, but is %v", err) + fmt.Printf("kperf pmuopen counting failed, expect err is nil, but is %v", err) return } } @@ -331,7 +331,7 @@ func main() { attr := kperf.PmuAttr{EvtList:evtList} pd, err := kperf.PmuOpen(kperf.COUNT, attr) if err != nil { - fmt.Printf("kperf pmuopen couting failed, expect err is nil, but is %v\n", err) + fmt.Printf("kperf pmuopen counting failed, expect err is nil, but is %v\n", err) return } } diff --git a/docs/Python_API.md b/docs/Python_API.md index 1ed8766..3afa10f 100644 --- a/docs/Python_API.md +++ b/docs/Python_API.md @@ -5,7 +5,7 @@ kperf.open(collector_type: kperf.PmuTaskType, pmu_attr: kperf.PmuAttr) * class PmuTaskType - * COUTING PMU计数模式 + * COUNTING PMU计数模式 * SAMPLING PMU采样模式 * SPE_SAMPLING SPE采样模式 * class PmuAttr @@ -51,7 +51,7 @@ kperf.open(collector_type: kperf.PmuTaskType, pmu_attr: kperf.PmuAttr) * SPE_EVENT_MISPREDICTED = 0x80 # mispredict * minLatency 仅收集该latency或者更高的样本数据 * includeNewFork - 是否支持子线程拆分,仅在COUTING模式中支持 + 是否支持子线程拆分,仅在COUNTING模式中支持 * branchSampleFilter * KPERF_NO_BRANCH_SAMPLE = 0 不采集branch sample stack数据 * KPERF_SAMPLE_BRANCH_USER = 1 << 0 分支目标位于用户空间 -- Gitee From 7ac9c0b4f3bbf6ced2e721d7007ba8ed8714eb7a Mon Sep 17 00:00:00 2001 From: wuying39 <921169248@qq.com> Date: Mon, 26 May 2025 09:38:04 +0800 Subject: [PATCH 10/12] corrected spelling errors --- docs/Python_API.md | 2 +- go/src/libkperf_test/libkperf_test.go | 
2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Python_API.md b/docs/Python_API.md index 3afa10f..9ef8703 100644 --- a/docs/Python_API.md +++ b/docs/Python_API.md @@ -329,7 +329,7 @@ for func_name in kperf.sys_call_func_list(): kperf.device_open(dev_attr: List[PmuDeviceAttr]) 初始化采集uncore事件指标的能力 * class PmuDeviceAttr: - * metic: 指定需要采集的指标 + * metric: 指定需要采集的指标 * PMU_DDR_READ_BW 采集每个numa的ddrc的读带宽,单位:Bytes * PMU_DDR_WRITE_BW 采集每个numa的ddrc的写带宽,单位:Bytes * PMU_L3_TRAFFIC 采集每个core的L3的访问字节数,单位:Bytes diff --git a/go/src/libkperf_test/libkperf_test.go b/go/src/libkperf_test/libkperf_test.go index 2f55951..e64ea6a 100644 --- a/go/src/libkperf_test/libkperf_test.go +++ b/go/src/libkperf_test/libkperf_test.go @@ -22,7 +22,7 @@ func TestCount(t *testing.T) { } for _, o := range dataVo.GoData { - t.Logf("================================Get Couting data success================================") + t.Logf("================================Get Counting data success================================") t.Logf("count base info comm=%v, evt=%v, pid=%v, tid=%v, coreId=%v, numaId=%v, sockedId=%v", o.Comm, o.Evt, o.Pid, o.Tid, o.CpuTopo.CoreId, o.CpuTopo.NumaId, o.CpuTopo.SocketId) t.Logf("count info count=%v, countPercent=%v", o.Count, o.CountPercent) } -- Gitee From 433eeaf03cd71e8da9555b55cacf470c63b6c927 Mon Sep 17 00:00:00 2001 From: wuying39 <921169248@qq.com> Date: Mon, 26 May 2025 11:00:45 +0800 Subject: [PATCH 11/12] collected spelling error --- docs/Go_API.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Go_API.md b/docs/Go_API.md index fa1ba8f..bb53343 100644 --- a/docs/Go_API.md +++ b/docs/Go_API.md @@ -334,7 +334,7 @@ func main() { func PmuDeviceOpen(attr []PmuDeviceAttr) (int, error) 初始化采集uncore事件指标的能力 * type PmuDeviceAttr struct: - * Metic: 指定需要采集的指标 + * Metric: 指定需要采集的指标 * PMU_DDR_READ_BW 采集每个numa的ddrc的读带宽,单位:Bytes * PMU_DDR_WRITE_BW 采集每个numa的ddrc的写带宽,单位:Bytes * PMU_L3_TRAFFIC 采集每个core的L3的访问字节数,单位:Bytes -- Gitee From 
d50a9fb22144c9936f7beaa1d7a78b36e1f86b7f Mon Sep 17 00:00:00 2001 From: twwang <920347125@qq.com> Date: Wed, 28 May 2025 15:01:46 +0800 Subject: [PATCH 12/12] =?UTF-8?q?PMU=5FL3=5FLAT=E7=9A=84=E7=BB=93=E6=9E=9C?= =?UTF-8?q?=E4=BD=BF=E7=94=A8ns=E4=BB=A3=E6=9B=BFcycles?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/Details_Usage.md | 14 +++++++------- include/pmu.h | 2 +- pmu/pmu_metric.cpp | 16 +++++++++++++++- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/docs/Details_Usage.md b/docs/Details_Usage.md index 815f4d8..7530e67 100644 --- a/docs/Details_Usage.md +++ b/docs/Details_Usage.md @@ -710,7 +710,7 @@ auto len = PmuGetDevMetric(oriData, oriLen, devAttr, 1, &devData); // devData的长度等于cluster个数 for (int i=0;i L3_CLOCK_NS {{CHIP_TYPE::HIPB, 0.3448275862}}; + const UNCORE_METRIC_MAP UNCORE_METRIC_CONFIG_MAP = { {CHIP_TYPE::HIPA, HIP_A_UNCORE_METRIC_MAP}, {CHIP_TYPE::HIPB, HIP_B_UNCORE_METRIC_MAP}, @@ -931,6 +933,17 @@ namespace KUNPENG_PMU { return 64 * rawCount; } + static uint64_t L3Lat(const uint64_t rawCount) + { + const CHIP_TYPE chipType = GetCpuType(); + auto iter = L3_CLOCK_NS.find(chipType); + uint64_t count = rawCount; + if (iter != L3_CLOCK_NS.end()) { + count = rawCount * iter->second; + } + return count; + } + static PmuMetricMode GetMetricMode(const PmuDeviceMetric &metric) { switch(metric) { @@ -1144,7 +1157,8 @@ namespace KUNPENG_PMU { map computeMetricMap = {{PMU_DDR_READ_BW, DDRBw}, {PMU_DDR_WRITE_BW, DDRBw}, - {PMU_L3_TRAFFIC, L3Bw}}; + {PMU_L3_TRAFFIC, L3Bw}, + {PMU_L3_LAT, L3Lat}}; map aggregateMap = { {PMU_DDR_READ_BW, AggregateByNuma}, {PMU_DDR_WRITE_BW, AggregateByNuma}, -- Gitee