From de6a3e01612cf7402831c983e535108f38eb4c55 Mon Sep 17 00:00:00 2001 From: 18868120292 Date: Fri, 24 Nov 2023 18:21:29 +0800 Subject: [PATCH] jupyter notebook of cluster --- profiler/advisor/cluster_perf_analysis.ipynb | 614 +++++++++++++++++++ 1 file changed, 614 insertions(+) create mode 100644 profiler/advisor/cluster_perf_analysis.ipynb diff --git a/profiler/advisor/cluster_perf_analysis.ipynb b/profiler/advisor/cluster_perf_analysis.ipynb new file mode 100644 index 0000000000..0f59fc0c3e --- /dev/null +++ b/profiler/advisor/cluster_perf_analysis.ipynb @@ -0,0 +1,614 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "initial_id", + "metadata": { + "ExecuteTime": { + "end_time": "2023-11-21T13:31:25.022339600Z", + "start_time": "2023-11-21T13:31:25.016155200Z" + } + }, + "outputs": [], + "source": [ + "from advisor_backend.interface import Interface\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "57d17a21205c3c5e", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "# 集群调优分析\n", + "## 1. 集群分析的数据准备\n", + "首先我们当前支持PyTorch多卡大模型的集群分析,您需要输入集群分析的profiling_path路径,例如:\n", + "--{profiling_path}\n", + " -- xxxx_ascend_pt\n", + " -- xxxx_ascend_pt\n", + " -- xxxx_ascend_pt\n", + " ......\n", + " -- xxxx_ascend_pt\n", + "里面每张卡的profiling文件都是ascend_pt结尾的文件。\n", + "\n", + "## 2. 集群分析解决的问题\n", + "当前的功能主要有三项:\n", + "1). 识别多卡间的计算慢卡(根据计算时间等推断)\n", + "2). 识别多卡间的通信慢现象(根据通信链路的带宽判断)\n", + "3). 
对多卡间的计算算子进行统计展示(识别不同卡的算子差异)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "36b7a24cc7ca5da2", + "metadata": { + "ExecuteTime": { + "end_time": "2023-11-21T12:53:38.379699800Z", + "start_time": "2023-11-21T12:53:38.363755900Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# EDIT THE PROFILING DATA PATH (the directory that holds the per-rank *_ascend_pt results)\n", + "cluster_path = \"YOUR PATH\"\n", + "interface = Interface(cluster_path)" + ] + }, + { + "cell_type": "markdown", + "id": "cf832ac2e0dfa30f", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## 1) 识别慢卡" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "40aac93278dd6e34", + "metadata": { + "ExecuteTime": { + "end_time": "2023-11-21T12:53:41.815599700Z", + "start_time": "2023-11-21T12:53:41.783393700Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO]Cluster has been analyzed because of the existence of cluster analysis output directory.\n", + "[INFO]Skip Cluster analyze backend.\n" + ] + } + ], + "source": [ + "dataset = interface.get_data('cluster', 'slow rank')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "cd3fceda-49f0-439f-9c54-cc31490fc99e", + "metadata": {}, + "outputs": [], + "source": [ + "# EDIT THE DATA TO SHOW WHAT YOU WANT\n", + "data = dataset.get('data')\n", + "words = dataset.get('bottleneck')\n", + "rank_ids = list(data.keys())\n", + "# Bar series: each rank entry is read positionally as [compute, communication, free] time\n", + "compute_time = [data[key][0] for key in rank_ids]\n", + "communication_time = [data[key][1] for key in rank_ids]\n", + "free_time = [data[key][2] for key in rank_ids]\n", + "# Bar width (three bars are drawn per rank)\n", + "width = 0.2\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6a1d82fb-a31b-49ab-a859-6d4bb898c512", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Communication has some issues in the cluster, because the max difference of Communication time has reached 88.476ms. \n", + "Free has some issues in the cluster, because the max difference of Free time has reached 29.224ms. \n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Set the figure size\n", + "fig, ax = plt.subplots(figsize=(10, 8))\n", + "\n", + "x = np.arange(len(rank_ids))  # the label locations\n", + "# One group of three side-by-side bars per rank, offset by the bar width\n", + "ax.bar(x - width, compute_time, width, label='Computing')\n", + "ax.bar(x, communication_time, width, label='Communication')\n", + "ax.bar(x + width, free_time, width, label='Free')\n", + "\n", + "\n", + "# Add some text for labels, title and custom x-axis tick labels, etc.\n", + "ax.set_ylabel('Time(us)')  # NOTE(review): the bottleneck text reports ms -- confirm the unit of the data\n", + "ax.set_xlabel('Rank ID')\n", + "ax.set_title('Step Time')\n", + "ax.set_xticks(x)\n", + "ax.set_xticklabels(rank_ids)\n", + "ax.legend()\n", + "print(words)" + ] + }, + { + "cell_type": "markdown", + "id": "3511befaff513e8e", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## 2)识别通信链路慢" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2a1e617d2a117125", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO]Cluster has been analyzed because of the existence of cluster analysis output directory.\n", + "[INFO]Skip Cluster analyze backend.\n" + ] + } + ], + "source": [ + "dataset = interface.get_data('cluster', 'slow link')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c8bca314-a8da-4a5b-985a-c36f00154552", + "metadata": {}, + "outputs": [], + "source": [ + "# EDIT THE DATA TO SHOW WHAT YOU WANT\n", + "data = dataset.get('data')\n", + "words = dataset.get('bottleneck')\n", + "rank_ids = list(data.keys())\n", + "# Per-rank SDMA/RDMA bandwidth series; a rank without an entry is drawn as a zero-height bar\n", + "sdma_bw = [data[key].get(\"SDMA bandwidth(GB/s)\", 0) for key in rank_ids]\n", + "rdma_bw = [data[key].get(\"RDMA bandwidth(GB/s)\", 0) for key in rank_ids]\n", + "# Bar width (two bars are drawn per rank)\n", + "width = 0.4" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": 
"99ef04c9-ec07-4790-bbb6-0de9bf6c99d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RDMA bandwidth(GB/s): \n", + "The average is 0.041, while the maximum is 0.041GB/s and the minimum is 0.041GB/s. the difference is 0.0GB/s. \n", + "SDMA bandwidth(GB/s): \n", + "The average is 0.054, while the maximum is 0.056GB/s and the minimum is 0.052GB/s. the difference is 0.003GB/s. \n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Set the figure size\n", + "fig, ax = plt.subplots(figsize=(10, 8))\n", + "\n", + "x = np.arange(len(rank_ids))  # the label locations\n", + "# Two side-by-side bars per rank, centred on the tick\n", + "ax.bar(x - width/2, sdma_bw, width, label='SDMA')\n", + "ax.bar(x + width/2, rdma_bw, width, label='RDMA')\n", + "\n", + "# Add some text for labels, title and custom x-axis tick labels, etc.\n", + "ax.set_ylabel('Bandwidth(GB/s)')\n", + "ax.set_xlabel('Rank ID')\n", + "ax.set_title('Transport Bandwidth')\n", + "ax.set_xticks(x)\n", + "ax.set_xticklabels(rank_ids)\n", + "ax.legend()\n", + "print(words)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "77d6efa1-48e3-409f-82c4-3e2b3d868898", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RDMA bandwidth(GB/s): \n", + "The average is 0.041, while the maximum is 0.041GB/s and the minimum is 0.041GB/s. the difference is 0.0GB/s. \n", + "SDMA bandwidth(GB/s): \n", + "The average is 0.054, while the maximum is 0.056GB/s and the minimum is 0.052GB/s. the difference is 0.003GB/s. \n", + "\n" + ] + } + ], + "source": [ + "print(dataset.get('bottleneck'))" + ] + }, + { + "cell_type": "markdown", + "id": "ce27a1d3-1354-45f7-88d8-dcb8e438b2b2", + "metadata": {}, + "source": [ + "## 3) 分布式卡上的kernel算子统计展示" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e05774e9-c47e-400f-8421-b4b71bcdcbc4", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = interface.get_data('cluster', 'kernel')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e95b6849-1738-4975-929f-734edff5d1c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rank idNameInput ShapesInput Data TypesOutput ShapesDuration(us)_meanDuration(us)_varDuration(us)_maxDuration(us)_minDuration(us)_countDuration(us)_sum
00Add\"1024,2,5120;1024,2,5120\"DT_BF16;DT_BF16\"1024,2,5120\"45.01205082.95274855.925535.310816720.1928
10Add\"2,8192,5120;2,8192,5120\"DT_BF16;DT_BF16\"2,8192,5120\"447.183700NaN447.1837447.18371447.1837
20Add\"8192,2,1920;1920\"DT_BF16;DT_BF16\"8192,2,1920\"54.3308501.34284655.245652.64634217.3234
30Add\"8192,2,2560;2560\"DT_BF16;DT_BF16\"8192,2,2560\"75.4853750.76131576.280274.24074301.9415
40Add\";\"FLOAT;FLOAT\"\"1.2008840.0172571.49960.95975060.0442
....................................
144115atomic_memset-1_67_1998432_1_0\"\"UNDEFINED\"\"3.160000NaN3.16003.160013.1600
144215trans_Cast_14\"1\"FLOAT\"1\"1.3900000.0230671.60001.260045.5600
144315trans_Cast_15\"\"INT32\"\"64.44500036.27610070.300059.20004257.7800
144415trans_Cast_4\"1\"FLOAT\"1\"1.5550000.0358571.94001.3200812.4400
144515trans_Cast_5\"\"INT32\"\"62.89500015.58420069.860056.76008503.1600
\n", + "

1446 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " rank id Name Input Shapes \\\n", + "0 0 Add \"1024,2,5120;1024,2,5120\" \n", + "1 0 Add \"2,8192,5120;2,8192,5120\" \n", + "2 0 Add \"8192,2,1920;1920\" \n", + "3 0 Add \"8192,2,2560;2560\" \n", + "4 0 Add \";\" \n", + "... ... ... ... \n", + "1441 15 atomic_memset-1_67_1998432_1_0 \"\" \n", + "1442 15 trans_Cast_14 \"1\" \n", + "1443 15 trans_Cast_15 \"\" \n", + "1444 15 trans_Cast_4 \"1\" \n", + "1445 15 trans_Cast_5 \"\" \n", + "\n", + " Input Data Types Output Shapes Duration(us)_mean Duration(us)_var \\\n", + "0 DT_BF16;DT_BF16 \"1024,2,5120\" 45.012050 82.952748 \n", + "1 DT_BF16;DT_BF16 \"2,8192,5120\" 447.183700 NaN \n", + "2 DT_BF16;DT_BF16 \"8192,2,1920\" 54.330850 1.342846 \n", + "3 DT_BF16;DT_BF16 \"8192,2,2560\" 75.485375 0.761315 \n", + "4 FLOAT;FLOAT \"\" 1.200884 0.017257 \n", + "... ... ... ... ... \n", + "1441 UNDEFINED \"\" 3.160000 NaN \n", + "1442 FLOAT \"1\" 1.390000 0.023067 \n", + "1443 INT32 \"\" 64.445000 36.276100 \n", + "1444 FLOAT \"1\" 1.555000 0.035857 \n", + "1445 INT32 \"\" 62.895000 15.584200 \n", + "\n", + " Duration(us)_max Duration(us)_min Duration(us)_count Duration(us)_sum \n", + "0 55.9255 35.3108 16 720.1928 \n", + "1 447.1837 447.1837 1 447.1837 \n", + "2 55.2456 52.6463 4 217.3234 \n", + "3 76.2802 74.2407 4 301.9415 \n", + "4 1.4996 0.9597 50 60.0442 \n", + "... ... ... ... ... 
\n", + "1441 3.1600 3.1600 1 3.1600 \n", + "1442 1.6000 1.2600 4 5.5600 \n", + "1443 70.3000 59.2000 4 257.7800 \n", + "1444 1.9400 1.3200 8 12.4400 \n", + "1445 69.8600 56.7600 8 503.1600 \n", + "\n", + "[1446 rows x 11 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "27b75df4-792b-43dc-aa5c-d3c265642c1e", + "metadata": {}, + "outputs": [], + "source": [ + "# Save the table for offline inspection (tab-separated despite the .csv name); edit the path as needed\n", + "dataset.to_csv('cluster_kernel_details.csv', sep='\\t', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3b0afac-4b79-46a5-bce4-c17ebf690a38", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- Gitee