From 8783e2ac7cc6d3634c53174401f50e18e4a19e83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=A7=E5=AE=9D?= Date: Mon, 9 Dec 2024 17:11:39 +0800 Subject: [PATCH 1/8] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=AD=E6=96=87?= =?UTF-8?q?=E5=BB=BA=E8=AE=AE=E7=9A=84yaml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/advisor/rules/cn/__init__.py | 0 profiler/advisor/rules/cn/aicpu_rules.yaml | 103 ++++++++++++++++++ .../rules/cn/bandwidth_contention.yaml | 7 ++ profiler/advisor/rules/cn/byte_alignment.yaml | 6 + profiler/advisor/rules/cn/dataloader.yaml | 8 ++ .../rules/cn/environment_variable_info.yaml | 43 ++++++++ profiler/advisor/rules/cn/gc.yaml | 15 +++ profiler/advisor/rules/cn/memory.yaml | 14 +++ profiler/advisor/rules/cn/packet.yaml | 14 +++ profiler/advisor/rules/cn/rdma_analysis.yaml | 7 ++ profiler/advisor/rules/cn/sync_batchnorm.yaml | 41 +++++++ profiler/advisor/rules/cn/synchronize.yaml | 5 + profiler/advisor/rules/en/__init__.py | 0 .../advisor/rules/{ => en}/aicpu_rules.yaml | 0 .../rules/{ => en}/bandwidth_contention.yaml | 2 +- .../rules/{ => en}/byte_alignment.yaml | 0 .../advisor/rules/{ => en}/dataloader.yaml | 14 +-- .../{ => en}/environment_variable_info.yaml | 0 profiler/advisor/rules/{ => en}/gc.yaml | 0 profiler/advisor/rules/{ => en}/memory.yaml | 26 ++--- profiler/advisor/rules/{ => en}/packet.yaml | 0 .../advisor/rules/{ => en}/rdma_analysis.yaml | 0 .../rules/{ => en}/sync_batchnorm.yaml | 80 +++++++------- .../advisor/rules/{ => en}/synchronize.yaml | 8 +- .../advisor/rules/timeline_fusion_ops.yaml | 12 -- 25 files changed, 328 insertions(+), 77 deletions(-) create mode 100644 profiler/advisor/rules/cn/__init__.py create mode 100644 profiler/advisor/rules/cn/aicpu_rules.yaml create mode 100644 profiler/advisor/rules/cn/bandwidth_contention.yaml create mode 100644 profiler/advisor/rules/cn/byte_alignment.yaml create mode 100644 profiler/advisor/rules/cn/dataloader.yaml create mode 100644 
profiler/advisor/rules/cn/environment_variable_info.yaml create mode 100644 profiler/advisor/rules/cn/gc.yaml create mode 100644 profiler/advisor/rules/cn/memory.yaml create mode 100644 profiler/advisor/rules/cn/packet.yaml create mode 100644 profiler/advisor/rules/cn/rdma_analysis.yaml create mode 100644 profiler/advisor/rules/cn/sync_batchnorm.yaml create mode 100644 profiler/advisor/rules/cn/synchronize.yaml create mode 100644 profiler/advisor/rules/en/__init__.py rename profiler/advisor/rules/{ => en}/aicpu_rules.yaml (100%) rename profiler/advisor/rules/{ => en}/bandwidth_contention.yaml (80%) rename profiler/advisor/rules/{ => en}/byte_alignment.yaml (100%) rename profiler/advisor/rules/{ => en}/dataloader.yaml (99%) rename profiler/advisor/rules/{ => en}/environment_variable_info.yaml (100%) rename profiler/advisor/rules/{ => en}/gc.yaml (100%) rename profiler/advisor/rules/{ => en}/memory.yaml (99%) rename profiler/advisor/rules/{ => en}/packet.yaml (100%) rename profiler/advisor/rules/{ => en}/rdma_analysis.yaml (100%) rename profiler/advisor/rules/{ => en}/sync_batchnorm.yaml (98%) rename profiler/advisor/rules/{ => en}/synchronize.yaml (99%) diff --git a/profiler/advisor/rules/cn/__init__.py b/profiler/advisor/rules/cn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/profiler/advisor/rules/cn/aicpu_rules.yaml b/profiler/advisor/rules/cn/aicpu_rules.yaml new file mode 100644 index 000000000..4d69b0642 --- /dev/null +++ b/profiler/advisor/rules/cn/aicpu_rules.yaml @@ -0,0 +1,103 @@ +DataTypeSuggeation: &DataTypeSuggeation "数据类型{}在{}算子中可能会造成AICpu问题, 如果可以,尝试转换成{}。" +AICPU_DOC_URL: &AICPU_DOC_URL "https://gitee.com/ascend/mstt/blob/master/profiler/advisor/doc/Samples%20of%20AI%20CPU%20Operator%20Replacement.md" + +CommonChecker: + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ __ALL__ ] + ignore_type: [ cast, tensorequal, equal, nonzero, mul ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, 
uint8, int16, uint16, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, int16, uint16, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ cast ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ tensorequal ] + input: [ float, float32, float16, bool, int32, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ equal ] + input: [ float, float32, float16, bool, int32, int64, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ nonzero ] + input: [ float16, bool, dt_bf16 ] + output: [ int64 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ mul ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.RC1, 7.0.0] + op_type: [ __ALL__ ] + ignore_type: [ cast, tensorequal, equal, nonzero, mul ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, int16, complex64, complex128 ] + output: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, int16, complex64, complex128 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.RC1, 7.0.0] + op_type: [ cast ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + suggestion: *DataTypeSuggeation + + 
- DataTypeChecker: + cann_version: [8.0.RC1, 7.0.0] + op_type: [ tensorequal ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.RC1, 7.0.0] + op_type: [ equal ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.RC1, 7.0.0] + op_type: [ mul ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, complex64 ] + output: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, complex64 ] + suggestion: *DataTypeSuggeation + +ExampleGuideChecker: + - IndexPutChecker: + op_type: [index] + url: *AICPU_DOC_URL + suggestion: "请参考链接修改源码,尝试用等价的算子替换index算子。" + + - NonzeroChecker: + op_type: [ indexput, indexputv2 ] + url: *AICPU_DOC_URL + suggestion: "请参考链接修改源码,尝试用等价的算子替换indexput算子。" + + - CastChecker: + op_type: [ argmin ] + url: *AICPU_DOC_URL + suggestion: "请参考链接更新cann-tookit包到7.0.RC1及以上的版本。" + + - CastChecker: + op_type: [ nonzero ] + url: *AICPU_DOC_URL + suggestion: "请参考链接修改源码,尝试用等价的算子替换nonzero算子。" diff --git a/profiler/advisor/rules/cn/bandwidth_contention.yaml b/profiler/advisor/rules/cn/bandwidth_contention.yaml new file mode 100644 index 000000000..52a85b5ab --- /dev/null +++ b/profiler/advisor/rules/cn/bandwidth_contention.yaml @@ -0,0 +1,7 @@ +problem: "在执行计算和通信任务时,SDMA带宽低于 {threshold}GB/s。通常,并行计算和通信可以提高模型的运行效率。并发计算和通信任务可能会影响通信带宽。" +sdma_baseline: 18 #M +threshold: 0.8 +top_num: 10 +solutions: + - 基线数据对比: + desc: "比较启用融合功能之前和之后的性能数据,以评估是否带宽抢占的影响超过了融合的收益。" diff --git a/profiler/advisor/rules/cn/byte_alignment.yaml b/profiler/advisor/rules/cn/byte_alignment.yaml new file mode 100644 index 000000000..a653d5632 --- /dev/null +++ b/profiler/advisor/rules/cn/byte_alignment.yaml @@ -0,0 +1,6 @@ +problem: "{count}个通信算子的数据大小未对齐,这会降低通信性能。" +min_size: 512 # byte 
+top_num: 5 +solutions: + - 调整数据大小: + desc: "请调整数据大小,对齐通信算子的数据量,具体方法请联系相关HCCL研发。" \ No newline at end of file diff --git a/profiler/advisor/rules/cn/dataloader.yaml b/profiler/advisor/rules/cn/dataloader.yaml new file mode 100644 index 000000000..12c8a36eb --- /dev/null +++ b/profiler/advisor/rules/cn/dataloader.yaml @@ -0,0 +1,8 @@ +dataloader_duration_threshold: 10000 # us +problem: "dataloader数据加载速度较慢,一次迭代花费{dataloader_duration}us,通常小于{dataloader duration_threshold}us。" +solutions: + - "请检查数据目录的磁盘I/O。如果您正在ModelArts中训练模型,请将数据移动到“/cache”或装载更高效的云磁盘以获得更好的I/O。" + - "请检查运行时是否有任何其他可能影响数据加载的多进程操作,例如启动训练任务时使用命令'taskset…'绑定训练进程核。" + - "请检查数据的格式,避免使用tar、tar.gz、zip等文件格式。" + - "请为dataloader设置'pin_memory=True'。" + - "尝试调整dataloader参数'num_workers'。" \ No newline at end of file diff --git a/profiler/advisor/rules/cn/environment_variable_info.yaml b/profiler/advisor/rules/cn/environment_variable_info.yaml new file mode 100644 index 000000000..d342612db --- /dev/null +++ b/profiler/advisor/rules/cn/environment_variable_info.yaml @@ -0,0 +1,43 @@ +ASCEND_GLOBAL_LOG_LEVEL: + desc: "日志级别: 0-调试,1-信息,2-警告,3-错误。\n + 默认为错误级别。" + suggest: "调试或信息级别可能会导致培训性能下降,\n + 建议通过执行命令'export ASCEND_GLOBAL_LOGLEVEL=3来设置错误级别。" +HCCL_RDAM_TC: + desc: "配置网络端口发送的RoCE数据包的DSCP值。\n + 在IP数据报头的DS字段中,最右侧的6位是DSCP,最左侧的2位是0。\n + 应将其设置为DSCP*4。默认值为132,即DSCP为33(132=33*4)。" + suggest: "请参考 https://support.huawei.com/enterprise/zh/doc/EDOC1100371278/5eeeed85?idPath=23710424" + suggest_html: "请参考 链接" +HCCL_RDMA_SL: + desc: "指定RDMA NIC的优先级。\n + 该值必须与NIC的PFC优先级相同。\n + 否则,性能可能会恶化。\n + 取值范围为[0,7],默认值为4。" + suggest: "请参考 https://support.huawei.com/enterprise/zh/doc/EDOC1100371278/5eeeed85?idPath=23710424" + suggest_html: "请参考 链接" +ACLNN_CACHE_LIMIT: + desc: "缓存的aclnn算子的数量。" + suggest: "在alcnn和host耗时过长时,可以设置一个较大的数字,例如'export ACLNN_CACHE_LIMIT=100000'。" +HOST_CACHE_CAPACITY: + desc: "启用动态shape缓存。\n + 默认值为0,表示数据缓存已禁用。\n + 如果设置为非零正整数,例如10,系统将缓存最近频繁出现的10个输入形状的执行数据。\n + 当缓存的形状再次出现时,host执行性能将得到提高,但host内存使用量会增加。\n + 
具体的增加与HOST_CACHE_CAPACITY的值和模型的大小成正比。" + suggest: "设置一个非零数字,例如'export HOST_CACHE_CAPACITY=20'" +ASCEND_ENHANCE_ENABLE: + desc: "启用hccl ffts+模式。0-禁用,1-启用。" + suggest: "建议通过执行命令'export ASCEND_ENHANCE_enable=1'启用hccl ffts+模式。" +PYTORCH_NPU_ALLOC_CONF: + desc: "控制缓存分配器的行为。\n + 可选参数为max_split_size_mb、garbage_collection_threshold和expandable_segments。\n + 1.max_split_size_mb:v —— 大于v的内存块不会被分割。\n + 2.garbage_collection_threshold:t —— 设置阈值后,如果NPU内存使用量超过阈值,缓存分配器将开始回收内存块。t的取值范围为(0.0,1.0)。\n + 3.expandable_segments:True/False —— 默认值为False。如果为True,则此设置指示缓存分配器创建特定的内存块,这些内存块可以在以后扩展,以更好地处理频繁更改的内存使用情况。" + suggest: "export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True" +ASCEND_LAUNCH_BLOCKING: + desc: "是否在操作执行期间启用同步模式。\n + 当设置为1时,强制算子同步运行,从而更容易调试和跟踪代码中的问题。\n + 如果设置为0,则任务将以异步模式执行。" + suggest: "export ASCEND_LAUNCH_BLOCKING=1" \ No newline at end of file diff --git a/profiler/advisor/rules/cn/gc.yaml b/profiler/advisor/rules/cn/gc.yaml new file mode 100644 index 000000000..78238f0d3 --- /dev/null +++ b/profiler/advisor/rules/cn/gc.yaml @@ -0,0 +1,15 @@ +gc_problem_with_count: "检测到异常垃圾收集(GC)事件{gc_count}次,总时间为{gc_total_time}毫秒。\n + GC操作耗时且会阻塞整个过程。因此,模型训练过程中的某些步骤比其他步骤需要更长的时间。" +gc_problem_with_free: "由于torch_npu的版本较低,在分析时没有收集垃圾收集(GC)数据。但在{free_duration_time}微秒(us)的空闲时间内几乎没有主机任务,这可能是由Python的异常GC引起的。" +gc_threshold: 1000 #us +top_num: 10 +max_free_threshold: 200000 # us +max_acl_event_num_ratio: 0.0001 # max 10 events per 100 ms +max_acl_event_time_ratio: 0.01 # total time of acl events no larger than 0.01 * free duration +solutions: + - 内存管理: + desc: "实现有效的Python内存管理;不使用时及时释放内存,避免长期保留;避免对象之间的循环引用。" + - 调整GC阈值: + desc: "使用gc.set_threshold()调整垃圾收集阈值可能会延迟垃圾收集,但这是一个临时解决方案。" + - 关闭GC: + desc: "使用gc.disable()禁用gc,注意这是一个临时解决方案。" \ No newline at end of file diff --git a/profiler/advisor/rules/cn/memory.yaml b/profiler/advisor/rules/cn/memory.yaml new file mode 100644 index 000000000..6e789a545 --- /dev/null +++ b/profiler/advisor/rules/cn/memory.yaml @@ -0,0 +1,14 @@ +problem: "{memory_op_nam} 
{memory-op_name},花费{memory_op_dur}us,这将导致大量的空闲时间。" +max_total_duration: 10000 #us +solutions: + - AscendCL@aclMallocMemInner: + desc: + - "请通过命令'export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True'设置环境变量,然后开始训练任务。" + - AscendCL@aclrtFreePhysical: + desc: + - "在训练时执行'npu-smi info'观察HBM使用情况,如果达到HBM使用的最大值,请减小您的批大小/微批大小。" + - "首先使用参数'with_stack=True'采集Profiling。然后在trace_view.json中搜索'empty_cache'或'emptyCache'。如果存在,根据trace_view.json中相关事件的调用堆栈删除'torch.cuda.empty_cache()'或 'torch.npu.empty_cache()'等代码。" + - AscendCL@aclrtFree: + desc: + - "在训练时执行'npu-smi info'观察HBM使用情况,如果达到HBM使用的最大值,请减小您的批大小/微批大小。" + - "首先使用参数'with_stack=True'采集Profiling。然后在trace_view.json中搜索'empty_cache'或'emptyCache'。如果存在,根据trace_view.json中相关事件的调用堆栈删除'torch.cuda.empty_cache()'或 'torch.npu.empty_cache()'等代码。" \ No newline at end of file diff --git a/profiler/advisor/rules/cn/packet.yaml b/profiler/advisor/rules/cn/packet.yaml new file mode 100644 index 000000000..2b0fff4b7 --- /dev/null +++ b/profiler/advisor/rules/cn/packet.yaml @@ -0,0 +1,14 @@ +problem: "过小的通信数据包可能会导致host传递瓶颈。\n" +sdma_problem: "在SDMA通信中,通信数据量的{abnormal_ratio}小于{min_size}MB,总时间为{abnormal_time}ms。\n" +rdma_problem: "在RDMA通信中,通信数据量的{abnormal_ratio}小于{min_size}MB,总时间为{abnormal_time}ms。\n" +min_sdma_size: 16 #M +min_rdma_size: 1 #M +min_sdma_ratio: 0.2 +min_rdma_ratio: 0.2 +solutions: + - 数据并行建议: + desc: "如果异常通信集中在数据并行域,1.增加批量大小;2.增加梯度累积。" + - 检查内存优化策略: + desc: "如果内存优化策略为Zero3,建议在内存条件允许的情况下将其设置为Zero2/Zero1。" + - 适配亲和优化器或融合算子: + desc: "使用亲和优化器或融合算子可以减少通信算子的数量。" \ No newline at end of file diff --git a/profiler/advisor/rules/cn/rdma_analysis.yaml b/profiler/advisor/rules/cn/rdma_analysis.yaml new file mode 100644 index 000000000..c5a7bd14f --- /dev/null +++ b/profiler/advisor/rules/cn/rdma_analysis.yaml @@ -0,0 +1,7 @@ +problem: "发生RDMA通信重传。单次重传需要4秒以上。重传问题在{group_count}通信域中检测到。\n建议执行以下建议。" +min_retransmission_time: 4000 #ms +solutions: + - 检查RDMA传输时长: + desc: "检查怀疑要重新传输的RDMA算子的传输时间是否正确。" + - 检查网络配置: + desc: "检查交换机和计算节点服务器的网络配置。" \ No 
newline at end of file diff --git a/profiler/advisor/rules/cn/sync_batchnorm.yaml b/profiler/advisor/rules/cn/sync_batchnorm.yaml new file mode 100644 index 000000000..556f244b5 --- /dev/null +++ b/profiler/advisor/rules/cn/sync_batchnorm.yaml @@ -0,0 +1,41 @@ +problem: "发现{syncbn_num} 个SyncBatchNorm,这可能会导致python任务调度缓慢,设备之间通信频繁,降低训练性能。" +max_syncbn_num: 20 +solutions: + - 使能batchnorm: + desc: "可以通过删除像'torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)'这样的代码,禁用SyncBatchNorm。" + - 使能高效的SyncBatchNorm: + desc: "用以下代码替换运行时环境中python脚本'torch_npu/utils/syncbatchnorm.py'的'forward'方法。" + efficient_code: | + @staticmethod + def forward(self, input_tensor, weight, bias, running_mean, running_var, eps, momentum, process_group, world_size): + input_tensor = input_tensor.contiguous() + input_shape = input_tensor.shape + input_tensor_ = input_tensor.reshape(input_shape[0], input_shape[1], 1, -1) + sum_val, sum_square_val = torch.batch_norm_reduce(input_tensor_, eps) + + count = torch.full((1,), + input_tensor.numel() // input_tensor.size(1), + dtype=sum_val.dtype, + device=sum_val.device) + + num_channels = input_tensor.shape[1] + combined = torch.cat([sum_val, sum_square_val, count], dim=0) + combined_list = torch.empty((world_size,) + combined.shape, dtype=combined.dtype, device=combined.device) + dist.all_gather_togather(combined_list, combined, process_group, async_op=False) + sum_all, square_sum_all, count_all = torch.split(combined_list, num_channels, dim=1) + size = count_all.view(-1).sum() + if size == 1: + raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size)) + + mean, invstd = torch.batch_norm_gather_stats_update(input_tensor, + sum_all, + square_sum_all, + running_mean, + running_var, + momentum, + eps, + count_all.view(-1)) + self.save_for_backward(input_tensor, weight, mean, invstd, count_all.to(torch.int32)) + self.process_group = process_group + out = torch.batch_norm_elemt(input_tensor, weight, bias, mean, 
invstd, eps) + return out \ No newline at end of file diff --git a/profiler/advisor/rules/cn/synchronize.yaml b/profiler/advisor/rules/cn/synchronize.yaml new file mode 100644 index 000000000..d7535cb1e --- /dev/null +++ b/profiler/advisor/rules/cn/synchronize.yaml @@ -0,0 +1,5 @@ +problem: "SynchronizeStream会降低训练效率。发现{synchronize_num}个SynchronizeStreams和{node_launch_num}个NodeLaunch,SynchronizeStream和NodeLaunch的共现率为{co_occur_ratio}。" +min_co_occurrence_ratio: 0.5 +solutions: + - 关闭算子同步特性: + desc: "请检查您的环境变量'ASCEND_LAUNCH_BLOCKING',如果ASCEND_LAUNCH_BLOCKING=1,请执行'unset ASCEND_LAUNCH_BLOCKING',再开始训练任务。" \ No newline at end of file diff --git a/profiler/advisor/rules/en/__init__.py b/profiler/advisor/rules/en/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/profiler/advisor/rules/aicpu_rules.yaml b/profiler/advisor/rules/en/aicpu_rules.yaml similarity index 100% rename from profiler/advisor/rules/aicpu_rules.yaml rename to profiler/advisor/rules/en/aicpu_rules.yaml diff --git a/profiler/advisor/rules/bandwidth_contention.yaml b/profiler/advisor/rules/en/bandwidth_contention.yaml similarity index 80% rename from profiler/advisor/rules/bandwidth_contention.yaml rename to profiler/advisor/rules/en/bandwidth_contention.yaml index 0f25cc370..684ac22e9 100644 --- a/profiler/advisor/rules/bandwidth_contention.yaml +++ b/profiler/advisor/rules/en/bandwidth_contention.yaml @@ -1,4 +1,4 @@ -problem: "The SDMA bandwidth is lower than {threshold} when computing and communication tasks are performed \n +problem: "The SDMA bandwidth is lower than {threshold} GB/s when computing and communication tasks are performed \n concurrently. Generally, parallel computing and communication improves the running efficiency of the model. \n Concurrent computing and communication tasks may affect the communication bandwidth." 
sdma_baseline: 18 #M diff --git a/profiler/advisor/rules/byte_alignment.yaml b/profiler/advisor/rules/en/byte_alignment.yaml similarity index 100% rename from profiler/advisor/rules/byte_alignment.yaml rename to profiler/advisor/rules/en/byte_alignment.yaml diff --git a/profiler/advisor/rules/dataloader.yaml b/profiler/advisor/rules/en/dataloader.yaml similarity index 99% rename from profiler/advisor/rules/dataloader.yaml rename to profiler/advisor/rules/en/dataloader.yaml index 4b56be9ae..8cc23cb00 100644 --- a/profiler/advisor/rules/dataloader.yaml +++ b/profiler/advisor/rules/en/dataloader.yaml @@ -1,8 +1,8 @@ -dataloader_duration_threshold: 10000 # us -problem: "Found slow dataloader, cost {dataloader_duration} us for one step while profiling, normally less than {dataloader_duration_threshold} us." -solutions: - - "Please check the disk I/O of your data directory. If you are training model in ModelArts, please move data to '/cache' or mount a more efficient cloud disk for better I/O." - - "Please check if there are any other multiprocess operations in runtime that may have affected the dataloader, such as training process core binding command 'taskset ...' used for launching the training job." - - "Please check the format of your data, avoid file format like tar, tar.gz, zip." - - "Please set 'pin_memory=True' for your dataloader." +dataloader_duration_threshold: 10000 # us +problem: "Found slow dataloader, cost {dataloader_duration} us for one step while profiling, normally less than {dataloader_duration_threshold} us." +solutions: + - "Please check the disk I/O of your data directory. If you are training model in ModelArts, please move data to '/cache' or mount a more efficient cloud disk for better I/O." + - "Please check if there are any other multiprocess operations in runtime that may have affected the dataloader, such as training process core binding command 'taskset ...' used for launching the training job." 
+ - "Please check the format of your data, avoid file format like tar, tar.gz, zip." + - "Please set 'pin_memory=True' for your dataloader." - "Try to adjust dataloader parameter 'num_workers'." \ No newline at end of file diff --git a/profiler/advisor/rules/environment_variable_info.yaml b/profiler/advisor/rules/en/environment_variable_info.yaml similarity index 100% rename from profiler/advisor/rules/environment_variable_info.yaml rename to profiler/advisor/rules/en/environment_variable_info.yaml diff --git a/profiler/advisor/rules/gc.yaml b/profiler/advisor/rules/en/gc.yaml similarity index 100% rename from profiler/advisor/rules/gc.yaml rename to profiler/advisor/rules/en/gc.yaml diff --git a/profiler/advisor/rules/memory.yaml b/profiler/advisor/rules/en/memory.yaml similarity index 99% rename from profiler/advisor/rules/memory.yaml rename to profiler/advisor/rules/en/memory.yaml index b9f095d07..86eae5051 100644 --- a/profiler/advisor/rules/memory.yaml +++ b/profiler/advisor/rules/en/memory.yaml @@ -1,14 +1,14 @@ -problem: "Found {memory_op_num} {memory_op_name}, cost {memory_op_dur} us, which will lead to large amount of free time." -max_total_duration: 10000 #us -solutions: - - AscendCL@aclMallocMemInner: - desc: - - "Please set env by command 'export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True' and then start your training job" - - AscendCL@aclrtFreePhysical: - desc: - - "Execute 'npu-smi info' to observe the HBM-Usage while training, if reach the maximum of HBM-Usage, please reduce your batch size/micro batch size" - - "Profiling with the parameters 'with_stack=True' firstly. 
Then search 'empty_cache' or 'emptyCache' in trace_view.json, if exists, remove the code like 'torch.cuda.empty_cache()' or 'torch.npu.empty_cache()' according to the 'call stack' of relevant event in trace_view.json" - - AscendCL@aclrtFree: - desc: - - "Execute 'npu-smi info' to observe the HBM-Usage while training, if reach the maximum of HBM-Usage, please reduce your batch size/micro batch size" +problem: "Found {memory_op_num} {memory_op_name}, cost {memory_op_dur} us, which will lead to large amount of free time." +max_total_duration: 10000 #us +solutions: + - AscendCL@aclMallocMemInner: + desc: + - "Please set env by command 'export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True' and then start your training job" + - AscendCL@aclrtFreePhysical: + desc: + - "Execute 'npu-smi info' to observe the HBM-Usage while training, if reach the maximum of HBM-Usage, please reduce your batch size/micro batch size" + - "Profiling with the parameters 'with_stack=True' firstly. Then search 'empty_cache' or 'emptyCache' in trace_view.json, if exists, remove the code like 'torch.cuda.empty_cache()' or 'torch.npu.empty_cache()' according to the 'call stack' of relevant event in trace_view.json" + - AscendCL@aclrtFree: + desc: + - "Execute 'npu-smi info' to observe the HBM-Usage while training, if reach the maximum of HBM-Usage, please reduce your batch size/micro batch size" - "Profiling with the parameters 'with_stack=True' firstly. 
Then search 'empty_cache' or 'emptyCache' in trace_view.json, if exists, remove the code like 'torch.cuda.empty_cache()' or 'torch.npu.empty_cache()' according to the 'call stack' of relevant event in trace_view.json" \ No newline at end of file diff --git a/profiler/advisor/rules/packet.yaml b/profiler/advisor/rules/en/packet.yaml similarity index 100% rename from profiler/advisor/rules/packet.yaml rename to profiler/advisor/rules/en/packet.yaml diff --git a/profiler/advisor/rules/rdma_analysis.yaml b/profiler/advisor/rules/en/rdma_analysis.yaml similarity index 100% rename from profiler/advisor/rules/rdma_analysis.yaml rename to profiler/advisor/rules/en/rdma_analysis.yaml diff --git a/profiler/advisor/rules/sync_batchnorm.yaml b/profiler/advisor/rules/en/sync_batchnorm.yaml similarity index 98% rename from profiler/advisor/rules/sync_batchnorm.yaml rename to profiler/advisor/rules/en/sync_batchnorm.yaml index 0f702af6e..d65bcb0d4 100644 --- a/profiler/advisor/rules/sync_batchnorm.yaml +++ b/profiler/advisor/rules/en/sync_batchnorm.yaml @@ -1,41 +1,41 @@ -problem: "Found {syncbn_num} SyncBatchNorm, which can lead to slow python task dispatch and frequent communication between devices and finally reducing training efficiency." -max_syncbn_num: 20 -solutions: - - enable batchnorm: - desc: "disable SyncBatchNorm by remove the code like 'torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)' if possible." - - enable efficient SyncBatchNorm: - desc: "replace the 'forward' method of python script 'torch_npu/utils/syncbatchnorm.py' in your runtime environment." 
- efficient_code: | - @staticmethod - def forward(self, input_tensor, weight, bias, running_mean, running_var, eps, momentum, process_group, world_size): - input_tensor = input_tensor.contiguous() - input_shape = input_tensor.shape - input_tensor_ = input_tensor.reshape(input_shape[0], input_shape[1], 1, -1) - sum_val, sum_square_val = torch.batch_norm_reduce(input_tensor_, eps) - - count = torch.full((1,), - input_tensor.numel() // input_tensor.size(1), - dtype=sum_val.dtype, - device=sum_val.device) - - num_channels = input_tensor.shape[1] - combined = torch.cat([sum_val, sum_square_val, count], dim=0) - combined_list = torch.empty((world_size,) + combined.shape, dtype=combined.dtype, device=combined.device) - dist.all_gather_togather(combined_list, combined, process_group, async_op=False) - sum_all, square_sum_all, count_all = torch.split(combined_list, num_channels, dim=1) - size = count_all.view(-1).sum() - if size == 1: - raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size)) - - mean, invstd = torch.batch_norm_gather_stats_update(input_tensor, - sum_all, - square_sum_all, - running_mean, - running_var, - momentum, - eps, - count_all.view(-1)) - self.save_for_backward(input_tensor, weight, mean, invstd, count_all.to(torch.int32)) - self.process_group = process_group - out = torch.batch_norm_elemt(input_tensor, weight, bias, mean, invstd, eps) +problem: "Found {syncbn_num} SyncBatchNorm, which can lead to slow python task dispatch and frequent communication between devices and finally reducing training efficiency." +max_syncbn_num: 20 +solutions: + - enable batchnorm: + desc: "disable SyncBatchNorm by remove the code like 'torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)' if possible." + - enable efficient SyncBatchNorm: + desc: "replace the 'forward' method of python script 'torch_npu/utils/syncbatchnorm.py' in your runtime environment." 
+ efficient_code: | + @staticmethod + def forward(self, input_tensor, weight, bias, running_mean, running_var, eps, momentum, process_group, world_size): + input_tensor = input_tensor.contiguous() + input_shape = input_tensor.shape + input_tensor_ = input_tensor.reshape(input_shape[0], input_shape[1], 1, -1) + sum_val, sum_square_val = torch.batch_norm_reduce(input_tensor_, eps) + + count = torch.full((1,), + input_tensor.numel() // input_tensor.size(1), + dtype=sum_val.dtype, + device=sum_val.device) + + num_channels = input_tensor.shape[1] + combined = torch.cat([sum_val, sum_square_val, count], dim=0) + combined_list = torch.empty((world_size,) + combined.shape, dtype=combined.dtype, device=combined.device) + dist.all_gather_togather(combined_list, combined, process_group, async_op=False) + sum_all, square_sum_all, count_all = torch.split(combined_list, num_channels, dim=1) + size = count_all.view(-1).sum() + if size == 1: + raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size)) + + mean, invstd = torch.batch_norm_gather_stats_update(input_tensor, + sum_all, + square_sum_all, + running_mean, + running_var, + momentum, + eps, + count_all.view(-1)) + self.save_for_backward(input_tensor, weight, mean, invstd, count_all.to(torch.int32)) + self.process_group = process_group + out = torch.batch_norm_elemt(input_tensor, weight, bias, mean, invstd, eps) return out \ No newline at end of file diff --git a/profiler/advisor/rules/synchronize.yaml b/profiler/advisor/rules/en/synchronize.yaml similarity index 99% rename from profiler/advisor/rules/synchronize.yaml rename to profiler/advisor/rules/en/synchronize.yaml index efaa8a828..6315afa1e 100644 --- a/profiler/advisor/rules/synchronize.yaml +++ b/profiler/advisor/rules/en/synchronize.yaml @@ -1,5 +1,5 @@ -problem: "SynchronizeStream will reduce training efficiency. 
Found {synchronize_num} SynchronizeStream and {node_launch_num} NodeLaunch, the co-occurrence ratio of SynchronizeStream and NodeLaunch is {co_occur_ratio}" -min_co_occurrence_ratio: 0.5 -solutions: - - disable ascend launch blocking: +problem: "SynchronizeStream will reduce training efficiency. Found {synchronize_num} SynchronizeStream and {node_launch_num} NodeLaunch, the co-occurrence ratio of SynchronizeStream and NodeLaunch is {co_occur_ratio}" +min_co_occurrence_ratio: 0.5 +solutions: + - disable ascend launch blocking: desc: "please check your env 'ASCEND_LAUNCH_BLOCKING', if ASCEND_LAUNCH_BLOCKING=1, please execute 'unset ASCEND_LAUNCH_BLOCKING' and then start your training job." \ No newline at end of file diff --git a/profiler/advisor/rules/timeline_fusion_ops.yaml b/profiler/advisor/rules/timeline_fusion_ops.yaml index 3337c9386..46e02fef7 100644 --- a/profiler/advisor/rules/timeline_fusion_ops.yaml +++ b/profiler/advisor/rules/timeline_fusion_ops.yaml @@ -45,18 +45,6 @@ "(slice|chunk)-mul-mul-sigmoid" ] - cann_version: 8.0.RC1 - torch_version: [1.11.0, 2.1.0] - unique_id: 3 - inherit_unique_id: 2 - operator_rules: - aten: - add: - torch_npu.npu_geglu: [ "(slice|chunk)-gelu-mul", "(slice|chunk)-mul-gelu" ] - torch_npu.npu_group_norm_silu: [ "group_norm-silu" ] - torch.addmm: [ "mul-mul-add" ] - torch_npu.npu_add_layer_norm: [ "add-layer_norm" ] - -- cann_version: 8.0.0 torch_version: [1.11.0, 2.1.0] unique_id: 3 inherit_unique_id: 2 -- Gitee From 76df06d858c91f0fc3c6f36920f878ff3675ec2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=A7=E5=AE=9D?= Date: Mon, 9 Dec 2024 19:21:47 +0800 Subject: [PATCH 2/8] =?UTF-8?q?=E9=80=82=E9=85=8D=E4=B8=AD=E6=96=87?= =?UTF-8?q?=E5=BB=BA=E8=AE=AEyaml=E4=BB=A5=E5=8F=8A=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../overall_advice/overall_summary_advice.py | 71 +- .../Communication_retransmission_checker.py | 19 +- 
.../analyzer/cluster/slow_link_analyzer.py | 21 +- .../analyzer/cluster/slow_rank_analyzer.py | 22 +- .../alignment/byte_alignment_checker.py | 30 +- .../bandwidth_contention_checker.py | 26 +- .../communication/packet/packet_checker.py | 16 +- .../communication_retransmission_checker.py | 16 +- .../ai_core_freq/ai_core_freq_checker.py | 19 +- .../computation/aicpu/aicpu_checker.py | 17 +- .../computation/bound/block_dim_checker.py | 21 +- .../bound/operator_bound_checker.py | 3 + .../op_compile/dynamic_shape_checker.py | 39 +- .../analyzer/computation/operator_checker.py | 62 +- .../analyzer/dataloader/dataloader_checker.py | 3 + .../graph_fusion/graph_fusion_checker.py | 20 +- .../overall/environment_variable_checker.py | 19 +- .../overall/overall_summary_analyzer.py | 156 +++- .../dispatch/timeline_op_dispatch_analyzer.py | 31 +- .../fusion_ops/fusion_ops_analyzer.py | 38 +- .../fusion_ops/timeline_api_stack_checker.py | 35 +- .../analyzer/schedule/gc/gc_checker.py | 24 +- .../schedule/syncbn/syncbn_checker.py | 181 ++-- .../synchronize_stream_checker.py | 261 +++--- profiler/advisor/common/analyzer_scopes.py | 76 +- profiler/advisor/common/constant.py | 163 ++++ .../timeline_op_collector.py | 801 +++++++++--------- profiler/advisor/display/html/render.py | 4 +- profiler/advisor/result/result.py | 17 +- profiler/cli/analyze_cli.py | 4 + .../test_dataloader_checker.py | 2 +- .../timeline_advice/test_memory_op_checker.py | 122 +-- .../timeline_advice/test_syncbn_checker.py | 2 +- .../test_synchronize_stream.py | 2 +- 34 files changed, 1462 insertions(+), 881 deletions(-) create mode 100644 profiler/advisor/common/constant.py diff --git a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py index 80a83e586..05e545bef 100644 --- a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py +++ 
b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py @@ -19,30 +19,57 @@ from profiler.prof_common.constant import Constant from compare_interface.comparison_interface import ComparisonInterface +from profiler.prof_common.additional_args_manager import AdditionalArgsManager + class OverallSummaryAdvice(AdviceBase): - advice_map = { - "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.", - "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.", - "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule." - } - time_name_map = { - "Computing Time": "computing", - "Uncovered Communication Time": "communication", - "Free Time": "free", - 'Cube Time(Num)': 'Cube Time', - 'Vector Time(Num)': 'Vector Time', - 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', - 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', - 'Other Time': "Other Computing Time", - 'SDMA Time(Num)': 'SDMA Time' - } - performance_time_dict = { - "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', - 'Flash Attention Time(Backward)(Num)', 'Other Time'], - "Uncovered Communication Time(Wait Time)": [], - "Free Time": ['SDMA Time(Num)'] - } + language = AdditionalArgsManager().language + if language == "en": + advice_map = { + "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.", + "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.", + "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule." 
+ } + time_name_map = { + "Computing Time": "computing", + "Uncovered Communication Time": "communication", + "Free Time": "free", + 'Cube Time(Num)': 'Cube Time', + 'Vector Time(Num)': 'Vector Time', + 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', + 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', + 'Other Time': "Other Computing Time", + 'SDMA Time(Num)': 'SDMA Time' + } + performance_time_dict = { + "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', + 'Flash Attention Time(Backward)(Num)', 'Other Time'], + "Uncovered Communication Time(Wait Time)": [], + "Free Time": ['SDMA Time(Num)'] + } + else: + advice_map = { + "计算时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor computation.", + "未被掩盖的通信时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule.", + "空闲时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule." + } + time_name_map = { + "计算时长": "computing", + "未被掩盖的通信时长": "communication", + "空闲时长": "free", + 'Cube算子时长(数量)': 'Cube Time', + 'Vector算子时长(数量)': 'Vector Time', + 'Flash Attention算子时长(前向)(数量)': 'Flash Attention Time(Forward)', + 'Flash Attention算子时长(反向)(数量)': 'Flash Attention Time(Backward)', + '其它时长': "Other Computing Time", + 'SDMA时长(数量)': 'SDMA Time' + } + performance_time_dict = { + "计算时长": ['Cube时长(数量)', 'Vector时长(数量)', 'Flash Attention时长(前向)(数量)', + 'Flash Attention时长(反向)(数量)', '其它时长'], + "未被掩盖的通信时长(等待时长)": [], + "空闲时长": ['SDMA Time(Num)'] + } def __init__(self, collection_path: str, kwargs: dict): super().__init__(collection_path) diff --git a/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py b/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py index e3c6fa332..56dde89fb 100644 --- a/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py +++ b/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py @@ -19,8 +19,9 @@ from collections import defaultdict from 
profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.prof_common.file_manager import FileManager +from profiler.cluster_analyse.common_func.file_manager import FileManager from profiler.advisor.dataset.cluster.hccl_collection import HcclInfo +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -99,11 +100,19 @@ class CommunicationRetransmissionChecker: """ make record for what and how to optimize """ - optimization_item = OptimizeItem("Communication retransmission analysis", self.desc, self.suggestions) + language = AdditionalArgsManager().language + if language == "en": + problem_str = "Communication retransmission analysis" + else: + problem_str = "通信重传分析" + optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - sub_table_name = \ - "Comm Retransmission Analysis" if not self.stage else f"Stage-{self.stage}: Comm Retransmission Analysis" + if language == "en": + sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" + else: + sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" + result.add_detail(sub_table_name, headers=self.headers) for row in self.abnormal_rdma_list: @@ -120,9 +129,11 @@ class CommunicationRetransmissionChecker: ) def _init_rule(self): + language = AdditionalArgsManager().language syncbn_rule_path = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "rules", + language, "rdma_analysis.yaml" ) diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyzer.py b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py index a49ba8377..57c4694c0 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyzer.py +++ 
b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py @@ -18,11 +18,12 @@ from typing import Dict, List import logging from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer -from profiler.prof_common.constant import Constant +from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset from profiler.advisor.utils.utils import safe_index_value, convert_to_int +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -86,11 +87,19 @@ class SlowLinkAnalyzer(BaseAnalyzer): logger.info("The slow link (identified bottleneck) cannot provide a bottleneck \ because the analysis data is missing bandwidth information.") return - self.bottelneck += f'{link_type}: \n' \ - f' The average is {avg_bw}, \n' \ - f' while the maximum is {round(max(data_list), 3)}GB/s \n' \ - f' and the minimum is {round(min(data_list), 3)}GB/s. \n' \ - f' the difference is {round(max(data_list) - min(data_list), 3)}GB/s. \n' + language = AdditionalArgsManager().language + if language == "en": + self.bottelneck += f'{link_type}: \n' \ + f' The average is {avg_bw}, \n' \ + f' while the maximum is {round(max(data_list), 3)}GB/s \n' \ + f' and the minimum is {round(min(data_list), 3)}GB/s. \n' \ + f' the difference is {round(max(data_list) - min(data_list), 3)}GB/s. 
\n' + else: + self.bottelneck += f'{link_type}: \n' \ + f' 平均值是 {avg_bw}, \n' \ + f' 但最大值是 {round(max(data_list), 3)}GB/s ,\n' \ + f' 最小值是 {round(min(data_list), 3)}GB/s。\n' \ + f' 差距为 {round(max(data_list) - min(data_list), 3)}GB/s。 \n' def format_details(self): if not self.rank_bw_dict: diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py b/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py index 7b2311157..d96956f32 100644 --- a/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py @@ -21,6 +21,7 @@ from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.dataset.cluster.cluster_dataset import ClusterStepTraceTimeDataset from profiler.advisor.utils.utils import safe_index_value, safe_division, convert_to_int, safe_index, convert_to_float +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -30,6 +31,7 @@ class SlowRankAnalyzer(BaseAnalyzer): RANK = "rank" RATIO_THRESHOLD = 0.05 BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] + BOTTLENECK_LIST_CN = ['计算', '通信', "空闲"] dataset_cls_list = [ClusterStepTraceTimeDataset] COMPUTE = "compute(us)" FREE = "free(us)" @@ -80,16 +82,26 @@ class SlowRankAnalyzer(BaseAnalyzer): self.produce_bottleneck(self.step_trace_dict, i, mean_total_time) if not self.bottelneck: - self.bottelneck = "There is no slow rank issues" + language = AdditionalArgsManager().language + if language == "en": + self.bottelneck = "There is no slow rank issues" + else: + self.bottelneck = "没有慢节点问题" def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float): data_list = [data_tuple[produce_type] for rank_id, data_tuple in step_dict.items()] max_ratio = self.compute_max_gap_ratio(data_list, mean_total_time) if max_ratio > self.RATIO_THRESHOLD: - self.bottelneck += 
f'{self.BOTTLENECK_LIST[produce_type]} \n' \ - f' has some issues in the cluster, \n' \ - f' because the max difference of {self.BOTTLENECK_LIST[produce_type]} time \n' \ - f' has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. \n' + language = AdditionalArgsManager().language + if language == "en": + self.bottelneck += f'{self.BOTTLENECK_LIST[produce_type]} \n' \ + f' has some issues in the cluster, \n' \ + f' because the max difference of {self.BOTTLENECK_LIST[produce_type]} time \n' \ + f' has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. \n' + else: + self.bottelneck += f'集群中的{self.BOTTLENECK_LIST_CN[produce_type]}有问题, \n' \ + f'因为{self.BOTTLENECK_LIST_CN[produce_type]}时间的最大差距已经达到 \n' \ + f'{round(max_ratio * mean_total_time / 1000, 3)}ms。 \n' def make_record(self): """ diff --git a/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py b/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py index 3a9ebd86e..993f04ea8 100644 --- a/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py +++ b/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py @@ -20,6 +20,7 @@ from profiler.advisor.dataset.profiling.info_collection import HcclTask from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.prof_common.additional_args_manager import AdditionalArgsManager from profiler.prof_common.file_manager import FileManager from profiler.advisor.utils.utils import safe_division from profiler.prof_common.constant import Constant @@ -49,9 +50,15 @@ class ByteAlignmentChecker: self.abnormal_ops = [] self.suggestions = [] self._init_rule() - self.headers = [ - "op name", "total size(Byte)", "duration(us)", "abnormal duration(us)", "bandwidth(GB/s)" - ] + language = AdditionalArgsManager().language + 
if language == "en": + self.headers = [ + "op name", "total size(Byte)", "duration(us)", "abnormal duration(us)", "bandwidth(GB/s)" + ] + else: + self.headers = [ + "算子名称", "总大小(Byte)", "时长(us)", "异常时长(us)", "带宽(GB/s)" + ] @staticmethod def _calculate_bandwidth_gb_s(size, duration): @@ -76,11 +83,18 @@ class ByteAlignmentChecker: """ make record for what and how to optimize """ - optimization_item = OptimizeItem("byte alignment analysis", self.desc, self.suggestions) + language = AdditionalArgsManager().language + if language == "en": + problem_str = "Byte Alignment Analysis" + else: + problem_str = "字节对齐分析" + optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - sub_table_name = "Byte Alignment Analysis" if not self.stage else f"Stage-{self.stage}: " \ - f"Byte Alignment Analysis" + if language == "en": + sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str} " + else: + sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str} " result.add_detail(sub_table_name, headers=self.headers) for hccl_op in self.abnormal_ops: result.add_detail(sub_table_name, detail=hccl_op) @@ -125,9 +139,11 @@ class ByteAlignmentChecker: return [size, duration, abnormal_dur, flag] def _init_rule(self): + language = AdditionalArgsManager().language rule_path = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), "rules", + language, "byte_alignment.yaml" ) @@ -140,7 +156,7 @@ class ByteAlignmentChecker: raise RuntimeError("The configuration file of the byte alignment analyzer is abnormal. 
Please check.") for solution in self.solutions: for key, val in solution.items(): - self.suggestions.append(f"{key}, {val.get('desc')}") + self.suggestions.append(f"{val.get('desc')}") def _get_priority(self): if safe_division(self.abnormal_ops_dur, self.total_ops_dur) < self._LOW_PRIORITY: diff --git a/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py b/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py index acba63dc1..218876db8 100644 --- a/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py +++ b/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py @@ -23,6 +23,7 @@ from profiler.prof_common.file_manager import FileManager from profiler.advisor.utils.utils import convert_to_float from profiler.advisor.dataset.cluster.hccl_collection import HcclInfo from profiler.advisor.dataset.profiling.info_collection import OpInfo +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -70,7 +71,11 @@ class BandwidthContentionChecker: self.abnormal_sdma_list: List[SDMAOperator] = [] self.suggestions = [] self._init_rule() - self.headers = ["op name", "duration(ms)", "bandwidth(GB/s)"] + language = AdditionalArgsManager().language + if language == "en": + self.headers = ["op name", "duration(ms)", "bandwidth(GB/s)"] + else: + self.headers = ["算子名称", "时长(ms)", "带宽(GB/s)"] @staticmethod def check_sdma_operator(hccl_op: HcclInfo): @@ -135,11 +140,20 @@ class BandwidthContentionChecker: """ make record for what and how to optimize """ - optimization_item = OptimizeItem("bandwidth contention analysis", self.desc, self.suggestions) + language = AdditionalArgsManager().language + + if language == "en": + problem_str = "Bandwidth Contention Analysis" + else: + problem_str = "带宽分析" + + optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - 
sub_table_name = "Bandwidth Contention Analysis" if not self.stage else f"Stage-{self.stage}: " \ - f"Bandwidth Contention Analysis" + if language == "en": + sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" + else: + sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" result.add_detail(sub_table_name, headers=self.headers) for hccl_op in self.abnormal_sdma_list: result.add_detail(sub_table_name, detail=[hccl_op.name, round(hccl_op.dur, 4), round(hccl_op.bandwidth, 2)]) @@ -157,9 +171,11 @@ class BandwidthContentionChecker: priority_background_color=priority) def _init_rule(self): + language = AdditionalArgsManager().language contention_rule_path = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), "rules", + language, "bandwidth_contention.yaml" ) @@ -172,4 +188,4 @@ class BandwidthContentionChecker: raise RuntimeError("The configuration file of the bandwidth contention analyzer is abnormal. 
Please check.") for solution in self.solutions: for key, val in solution.items(): - self.suggestions.append(f"{key}, {val.get('desc')}") + self.suggestions.append(f"{val.get('desc')}") diff --git a/profiler/advisor/analyzer/communication/packet/packet_checker.py b/profiler/advisor/analyzer/communication/packet/packet_checker.py index 250d76af3..7b556fa8d 100644 --- a/profiler/advisor/analyzer/communication/packet/packet_checker.py +++ b/profiler/advisor/analyzer/communication/packet/packet_checker.py @@ -19,6 +19,7 @@ from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.prof_common.file_manager import FileManager from profiler.advisor.utils.utils import convert_to_float +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -109,10 +110,19 @@ class PacketChecker: """ make record for what and how to optimize """ - optimization_item = OptimizeItem("Packet analysis", self.desc, self.suggestions) + language = AdditionalArgsManager().language + if language == "en": + problem_str = "Packet analysis" + else: + problem_str = "包分析" + optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - sub_table_name = "Packet Analysis" if not self.stage else f"Stage-{self.stage}: Packet Analysis" + if language == "en": + sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" + else: + sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" + result.add_detail(sub_table_name, headers=self.headers) result.add_detail(sub_table_name, detail=self.small_packet_detail) @@ -128,9 +138,11 @@ class PacketChecker: priority_background_color=priority) def _init_rule(self): + language = AdditionalArgsManager().language syncbn_rule_path = os.path.join( 
os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), "rules", + language, "packet.yaml" ) diff --git a/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py b/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py index bde967918..1dcef3235 100644 --- a/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py +++ b/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py @@ -19,6 +19,7 @@ from collections import defaultdict from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.prof_common.additional_args_manager import AdditionalArgsManager from profiler.prof_common.file_manager import FileManager from profiler.advisor.dataset.cluster.hccl_collection import HcclInfo from profiler.prof_common.constant import Constant @@ -102,11 +103,18 @@ class CommunicationRetransmissionChecker: """ make record for what and how to optimize """ - optimization_item = OptimizeItem("Communication retransmission analysis", self.desc, self.suggestions) + language = AdditionalArgsManager().language + if language == "en": + problem_str = "Communication retransmission analysis" + else: + problem_str = "通信重传分析" + optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - sub_table_name = \ - "Comm Retransmission Analysis" if not self.stage else f"Stage-{self.stage}: Comm Retransmission Analysis" + if language == "en": + sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" + else: + sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" result.add_detail(sub_table_name, headers=self.headers) 
for row in self.abnormal_rdma_list: @@ -124,9 +132,11 @@ class CommunicationRetransmissionChecker: priority_background_color=priority) def _init_rule(self): + language = AdditionalArgsManager().language syncbn_rule_path = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), "rules", + language, "rdma_analysis.yaml" ) diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py index f42b95147..c8bdea37b 100644 --- a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py @@ -19,6 +19,7 @@ from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.config.config import Config from profiler.advisor.utils.utils import convert_to_float +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -79,11 +80,19 @@ class AICoreFreqChecker: if not self.ai_core_freq_issues: return - self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " - f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") - if self.rank: - self.desc = f"For rank {self.rank}, " + self.desc.lower() - self.suggestions = "Please check the temperature or max power of your machine." + language = AdditionalArgsManager().language + if language == "en": + self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " + f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") + if self.rank: + self.desc = f"For rank {self.rank}, " + self.desc.lower() + self.suggestions = "Please check the temperature or max power of your machine." 
+ else: + self.desc = ( + f"在降频期间发现{len(self.decrease_freq_ops)}个算子,频率降低比例超过了{self.DECREASE_FREQ_RATIO}。") + if self.rank: + self.desc = f"对于{self.rank}号卡," + self.desc + self.suggestions = "请检查您的机器温度或最大功率。" def make_record(self, result: OptimizeResult): """ diff --git a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py index 3c6410892..8b1171664 100644 --- a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py +++ b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py @@ -22,15 +22,19 @@ from profiler.advisor.analyzer.schedule.fusion_ops.timeline_api_stack_checker im from profiler.advisor.dataset.dataset import Dataset from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset from profiler.advisor.dataset.timeline_event_dataset import ComputationAnalysisDataset +from profiler.prof_common.additional_args_manager import AdditionalArgsManager from profiler.prof_common.file_manager import FileManager from profiler.prof_common.constant import Constant class AicpuChecker(OperatorChecker): _CHECKER = "aicpu operator" _PROBLEM = "AICPU operator" + _PROBLEM_CN = "AICPU算子" _MIN_TASK_DURATION = 20 _description = f"Some operators and task duration exceed {_MIN_TASK_DURATION} us, such as :\n" + _description_cn = f"一些算子和任务执行时间超过了{_MIN_TASK_DURATION}us,比如:\n" _SUGGESTION: List[str] = ["Modify code to avoid aicpu operator"] + _SUGGESTION_CN: List[str] = ["修改代码避免使用aicpu类算子"] STACK_INFO_ITEMS = "stack_info" SUGGESTION_INFO_ITEMS = "suggestions" _ITEMS = [ @@ -46,7 +50,9 @@ class AicpuChecker(OperatorChecker): self.total_task_duration = 0.0 self.aicpu_task_duration = 0.0 - def load_aicpu_rules(self, rule_path="rules/aicpu_rules.yaml"): + def load_aicpu_rules(self): + language = AdditionalArgsManager().language + rule_path = "rules/" + language + "/aicpu_rules.yaml" if not os.path.isabs(rule_path): rule_path = os.path.join(os.path.dirname(__file__), "../../../", 
rule_path) @@ -152,8 +158,13 @@ class AicpuChecker(OperatorChecker): and op.op_name not in double_type_ai_cpu_operator): double_type_ai_cpu_operator.append(op.op_name) if bool(double_type_ai_cpu_operator): - self._SUGGESTION.append("Try to convert double type operator to float, such as {}".format( - ",".join(double_type_ai_cpu_operator))) + language = AdditionalArgsManager().language + if language == "en": + self._SUGGESTION.append("Try to convert double type operator to float, such as {}".format( + ",".join(double_type_ai_cpu_operator))) + else: + self._SUGGESTION.append("尝试将double类型的算子转换成float,比如{}".format( + ",".join(double_type_ai_cpu_operator))) return True def make_render(self, html_render, record, add_render_list=True, **kwargs): diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index 18b46b517..2e2877717 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -19,6 +19,7 @@ from profiler.advisor.analyzer.computation.operator_checker import OperatorCheck from profiler.prof_common.constant import Constant from profiler.advisor.config.config import Config from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -27,9 +28,11 @@ class BlockDimChecker(OperatorChecker): _SUGGESTION: List[str] = [] _CHECKER = "block dim" _PROBLEM = "block dim" + _PROBLEM_CN = "AICore核数" _aicore_num = 0 _aiv_num = 0 _description = "some operator does not make full use of {} ai core" + _description_cn = "一些算子没有充分利用{}个AICore核" _ITEMS = [ "op_name", "op_type", "task_type", "task_duration", "income", "block_dim", "mix_block_dim", "input_shapes", "input_data_types", "input_formats", "output_shapes", "output_data_types", "output_formats" @@ -82,11 +85,19 @@ class 
BlockDimChecker(OperatorChecker): self._aiv_num = int(Config().get_config("aiv_num")) except ValueError as e: logger.warning("get aiv_num failed, please check info.json: %s", e) - self._description = self._description.format(self._aicore_num) - if self._aiv_num: - self._description += f" or {self._aiv_num} ai vector core" - self._description += f";\n Top-{OperatorChecker._MAX_TUNE_OP_NUM} operator of " \ - "task duration are as follows:\n" + + language = AdditionalArgsManager().language + if language == "en": + self._description = self._description.format(self._aicore_num) + if self._aiv_num: + self._description += f" or {self._aiv_num} ai vector core" + self._description += f";\n Top-{OperatorChecker._MAX_TUNE_OP_NUM} operator of " \ + "task duration are as follows:\n" + else: + self._description_cn = self._description_cn.format(self._aicore_num) + if self._aiv_num: + self._description_cn += f"或者{self._aiv_num}个AIVector核" + self._description_cn += f";\n 任务耗时最长的{OperatorChecker._MAX_TUNE_OP_NUM}个算子如下:" return True def _check_operator(self, op_info) -> bool: diff --git a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py index 5e2ee2251..fab954a9c 100644 --- a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py +++ b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py @@ -28,10 +28,13 @@ class OperatorBoundChecker(OperatorChecker): _MIN_TASK_DURATION = 20 # min task duration 20us _CHECKER = "operator no bound" _PROBLEM = "operator no bound" + _PROBLEM_CN = "算子瓶颈" _SUGGESTION: List[str] = [] _description = ( f"There is no mte, cube, vector, scalar ratio is more than {to_percent(Config().operator_bound_ratio)};\n" + f"Top task duration operators need to be tuned are as follows: \n") + _description_cn = ("mte,cube,vetor,scalar比都没有超过 {to_percent(Config().operator_bound_ratio)};\n" + "需要调整的任务执行时间最长的算子如下: \n") _ITEMS = [ "op_name", "op_type", 
"task_type", "task_duration", "vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio", "mte2_ratio", "mte3_ratio", "block_dim", "input_shapes", "input_data_types", "input_formats", "output_shapes", diff --git a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py index 4ca563e7f..eb341d0bf 100644 --- a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py +++ b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py @@ -20,6 +20,7 @@ from profiler.advisor.analyzer.computation.operator_checker import OperatorCheck from profiler.advisor.config.config import Config from profiler.advisor.dataset.profiling.info_collection import OpInfo from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -27,12 +28,20 @@ logger = logging.getLogger() class DynamicShapeChecker(OperatorChecker): ENABLE_COMPILED_SUGGESTION = "1. Please try to set environment by execute `export HOST_CACHE_CAPACITY=20`.\n." \ "2. Please place the following code at the entrance of the python script to disable jit compile.\n " \ - "Code: `torch_npu.npu.set_compile_mode(jit_compile=False);\n " \ + "Code: `torch_npu.npu.set_compile_mode(jit_compile=False) \n " \ "torch_npu.npu.config.allow_internal_format = False`.\n" + ENABLE_COMPILED_SUGGESTION_CN = "1. 尝试设置环境变量'export HOST_CACHE_CAPACITY=20'。\n" \ + "2. 
在python脚本入口加入以下代码关闭在线编译:\n" \ + "'torch_npu.npu.set_compile_mode(jit_compile=False) \n " \ + "torch_npu.npu.config.allow_internal_format = False' \n" _SUGGESTION: List[str] = [ENABLE_COMPILED_SUGGESTION] + _SUGGESTION_CN: List[str] = [ENABLE_COMPILED_SUGGESTION_CN] _CHECKER = "dynamic shape operator" _PROBLEM = "Dynamic shape operator" + _PROBLEM_CN = "动态shape算子" _description = f"Found all operators are dynamic shape" + _description_cn = f"找到所有是动态shape的算子" + _op_list: List[OpInfo] = [] _tune_op_list: List[str] = [] # record op name to be tuned, and save to tune_ops_file.cfg _op_views: List = [] @@ -47,14 +56,23 @@ class DynamicShapeChecker(OperatorChecker): """ make record for what and how to optimize """ - - if rank is not None: - self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() - optimization_item = OptimizeItem( - self._PROBLEM, - self._description, - self._SUGGESTION - ) + language = AdditionalArgsManager().language + if language == "en": + if rank is not None: + self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() + optimization_item = OptimizeItem( + self._PROBLEM, + self._description, + self._SUGGESTION + ) + else: + if rank is not None: + self._PROBLEM_CN = f"{rank}号卡 ".capitalize() + self._PROBLEM_CN + optimization_item = OptimizeItem( + self._PROBLEM_CN, + self._description_cn, + self._SUGGESTION_CN + ) statistics_item = StatisticsItem("", "", 1) return OptimizeRecord(optimization_item, statistics_item) @@ -72,6 +90,9 @@ class DynamicShapeChecker(OperatorChecker): if release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION: release_suggestion += \ f"for details please refer to link : LINK" + elif release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION_CN: + release_suggestion += \ + f"详细信息请参考:链接" release_suggestion_list.append(release_suggestion.replace('\n', '
')) format_result = {"record": record.__dict__, "suggestion": '
'.join(release_suggestion_list)} return format_result diff --git a/profiler/advisor/analyzer/computation/operator_checker.py b/profiler/advisor/analyzer/computation/operator_checker.py index a58fc0d89..84215be9a 100644 --- a/profiler/advisor/analyzer/computation/operator_checker.py +++ b/profiler/advisor/analyzer/computation/operator_checker.py @@ -25,6 +25,7 @@ from profiler.advisor.dataset.profiling.info_collection import OpInfo from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord from profiler.advisor.utils.utils import safe_division, convert_to_float +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -37,19 +38,29 @@ class OperatorChecker(VersionControl): _MIN_TOTAL_DURATION_RATIO = 1.0 _CHECKER = str() _PROBLEM = str() + _PROBLEM_CN = str() _description = str() + _description_cn = str() STACK_INFO_ITEMS = "" _ITEMS: List[str] = [] _SUGGESTION: List[str] = [] + _SUGGESTION_CN: List[str] = [] SKIP_CHECK_MSG = "Skip %s checker because of not containing %s" _tune_op_info_list: List[OpInfo] = [] PyTorch_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE, such as:\n" \ f"'aoe --job_type=2 --model_path=$user_dump_path " \ f"--tune_ops_file={Config().tune_ops_file}'\n" + PyTorch_OPERATOR_TUNE_SUGGESTION_CN = f"通过AOE优化算子,使用样例如下:\n" \ + f"'aoe --job_type=2 --model_path=$user_dump_path " \ + f"--tune_ops_file={Config().tune_ops_file}'\n" MSLite_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE in mindspore lite framework, such as:\n" \ f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \ f"--modelFile=$user_model.onnx --outputFile=user_model " \ f"--configFile=./config.txt\n" + MSLite_OPERATOR_TUNE_SUGGESTION_CN = f"在Mindpore Lite 框架通过AOE优化算子,使用样例如下:\n" \ + f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \ + f"--modelFile=$user_model.onnx 
--outputFile=user_model " \ + f"--configFile=./config.txt\n" def __init__(self, cann_version: str): self.cann_version = cann_version @@ -116,9 +127,12 @@ class OperatorChecker(VersionControl): :param profiling_data: profiling data :return: optimize record """ - + language = AdditionalArgsManager().language if rank is not None: - self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() + if language == "en": + self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() + else: + self._PROBLEM_CN = f"{rank}号卡".capitalize() + self._PROBLEM_CN task_duration_list = [float(op_info.get_attr("task_duration")) for op_info in self._op_list @@ -127,11 +141,19 @@ class OperatorChecker(VersionControl): total_task_duration = profiling_data.op_summary.get_total_task_duration() count = len(task_duration_list) statistics_item = StatisticsItem(total_task_duration, total_cost_time, count, self.get_incomes()) - optimization_item = OptimizeItem( - self._PROBLEM, - self._get_description(self._description, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]), - self._SUGGESTION - ) + if language == "en": + optimization_item = OptimizeItem( + self._PROBLEM, + self._get_description(self._description, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]), + self._SUGGESTION + ) + else: + optimization_item = OptimizeItem( + self._PROBLEM_CN, + self._get_description(self._description_cn, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]), + self._SUGGESTION_CN + ) + return OptimizeRecord(optimization_item, statistics_item) def _get_description(self, description, op_type_list=None): @@ -199,6 +221,9 @@ class OperatorChecker(VersionControl): if release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION: release_suggestion += \ (f"for details please refer to link : LINK") + elif release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION_CN: + release_suggestion += \ + (f"详细信息请参考:链接") elif release_suggestion == 
OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION: release_suggestion += \ (f"\nThe config file for MSLite AOE usage is as follows:\n" \ @@ -207,6 +232,14 @@ class OperatorChecker(VersionControl): f"--tune_ops_file={Config().tune_ops_file}\n" f"\nFor details please refer to link : LINK") + elif release_suggestion == OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION_CN: + release_suggestion += \ + (f"\nMSLite AOE的配置文件如下usage:\n" \ + f"[ascend_context]\n" \ + f"aoe_mode=\"operator tuning\"\n" \ + f"--tune_ops_file={Config().tune_ops_file}\n" + f"\n详细信息请参考:链接") release_suggestion_list.append(release_suggestion.replace('\n', '
')) format_result = { "record": record.__dict__, @@ -321,10 +354,17 @@ class OperatorChecker(VersionControl): return details def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: - if profiling_data.prof_type == EnumParamsParser().profiling_type.ascend_pytorch_profiler: - self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) - elif profiling_data.prof_type == EnumParamsParser.profiling_type.mslite: - self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) + language = AdditionalArgsManager().language + if profiling_data.PROF_TYPE == EnumParamsParser().profiling_type.ascend_pytorch_profiler: + if language == "en": + self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) + else: + self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION_CN) + elif profiling_data.PROF_TYPE == EnumParamsParser.profiling_type.mslite: + if language == "en": + self._SUGGESTION_CN.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) + else: + self._SUGGESTION_CN.append(self.MSLite_OPERATOR_TUNE_SUGGESTION_CN) def _check_data(self, profiling_data): return True diff --git a/profiler/advisor/analyzer/dataloader/dataloader_checker.py b/profiler/advisor/analyzer/dataloader/dataloader_checker.py index de9f7d7ca..7fade10cd 100644 --- a/profiler/advisor/analyzer/dataloader/dataloader_checker.py +++ b/profiler/advisor/analyzer/dataloader/dataloader_checker.py @@ -20,6 +20,7 @@ import yaml from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.prof_common.additional_args_manager import AdditionalArgsManager from profiler.prof_common.file_manager import FileManager logger = logging.getLogger() @@ -80,9 +81,11 @@ class DataloaderChecker: rank=kwargs.get("rank")) def _init_rule(self): + language = AdditionalArgsManager().language dataloader_rule_path = os.path.join( 
os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "rules", + language, "dataloader.yaml" ) dataloader_rule = FileManager.read_yaml_file(dataloader_rule_path) diff --git a/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py index 2cfde931a..96505cb6e 100644 --- a/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py +++ b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py @@ -23,6 +23,7 @@ from profiler.advisor.common.graph.graph import Graph from profiler.advisor.common.graph.graph_parser import QueryGraphParser from profiler.advisor.dataset.graph_dataset import GraphDataset from profiler.advisor.common.graph.graph_match import find_isomorphisms +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -180,11 +181,20 @@ class GraphFusionRules: if not self.candidates: return - optimization_item = OptimizeItem( - "fusion issue", - f"Found {len(self.candidates)} fusion issues", - ["Check fusion issues detail in mstt_advisor*.html"] - ) + language = AdditionalArgsManager().language + if language == "en": + optimization_item = OptimizeItem( + "fusion issue", + f"Found {len(self.candidates)} fusion issues", + ["Check fusion issues detail in mstt_advisor*.html"] + ) + else: + optimization_item = OptimizeItem( + "融合问题", + f"发现 {len(self.candidates)} 个融合问题", + ["在mstt_advisor*.html中查看融合问题的细节信息"] + ) + total_time = 0.0 for candidate in self.task_duration_list: for duration in candidate: diff --git a/profiler/advisor/analyzer/overall/environment_variable_checker.py b/profiler/advisor/analyzer/overall/environment_variable_checker.py index 700946e90..dbacdf183 100644 --- a/profiler/advisor/analyzer/overall/environment_variable_checker.py +++ b/profiler/advisor/analyzer/overall/environment_variable_checker.py @@ -21,6 +21,7 @@ from profiler.advisor.result.item import OptimizeRecord from 
profiler.advisor.common.analyzer_scopes import SupportedScopes from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.utils.utils import convert_to_int +from profiler.prof_common.additional_args_manager import AdditionalArgsManager class EnvironmentVariabelChecker: @@ -35,7 +36,11 @@ class EnvironmentVariabelChecker: "ASCEND_LAUNCH_BLOCKING": lambda x: convert_to_int(x) != 1, } - HEADERS = ["Environment", "Value", "Description", "Suggestion"] + language = AdditionalArgsManager().language + if language == "en": + HEADERS = ["Environment", "Value", "Description", "Suggestion"] + else: + HEADERS = ["环境变量", "值", "描述", "建议"] def __init__(self): self.environment_info = self.read_environment_info() @@ -44,9 +49,11 @@ class EnvironmentVariabelChecker: @staticmethod def read_environment_info(): + language = AdditionalArgsManager().language environment_variable_info_path = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "rules", + language, "environment_variable_info.yaml" ) return FileManager.read_yaml_file(environment_variable_info_path) @@ -78,8 +85,14 @@ class EnvironmentVariabelChecker: def make_record(self, result: OptimizeResult): if not self.env_suggest_csv: return - desc = f"Describe and suggest the optimal environment variable settings" - suggestion = "Please set the optimal environment variable" + language = AdditionalArgsManager().language + if language == "en": + desc = f"Describe and suggest the optimal environment variable settings" + suggestion = "Please set the optimal environment variable" + else: + desc = f"描述并给出最优的环境变量配置建议" + suggestion = "请设置最优的环境变量" + optimization_item = OptimizeItem( SupportedScopes.ENVIRONMENT_VARIABLE_ANALYSIS, diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index 143eb854b..90b83e82c 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ 
b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -20,42 +20,79 @@ from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.result.result import OptimizeResult from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface -from profiler.prof_common.constant import Constant +from profiler.prof_common.additional_args_manager import AdditionalArgsManager + class OverallSummaryAnalyzer(BaseAnalyzer): - OVERALL_SUMMARY_ANALYZER = "overall summary" - advice_map = { - "Computing Time": "if you want more detailed advice please go to mstt_advisor_*.html", - "Uncovered Communication Time": "if you want more detailed advice please go to mstt_advisor_*.html", - "Free Time": "if you want more detailed advice please go to mstt_advisor_*.html" - } - time_name_map = { - "Computing Time": "computing", - "Uncovered Communication Time": "communication", - "Free Time": "free", - 'Cube Time(Num)': 'Cube Time', - 'Vector Time(Num)': 'Vector Time', - 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', - 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', - 'Other Time': "Other Computing Time", - 'SDMA Time(Num)': 'SDMA Time' - } - performance_time_dict = { - "Computing Time": "computing_time_ms", - " -- Flash Attention": "fa_time_ms", - " -- Conv": "conv_time_ms", - " -- Matmul": "matmul_time_ms", - " -- Vector": "vector_time_ms", - " -- SDMA(Tensor Move)": "tensor_move_time_ms", - " -- Other Cube": "other_cube_time_ms", - "Uncovered Communication Time": "uncovered_communication_time_ms", - " -- Wait": "wait_time_ms", - " -- Transmit": "transmit_time_ms", - "Free Time": "free_time_ms", - " -- SDMA": "sdma_time_ms", - " -- Free": "free_ms", - "E2E Time": "e2e_time_ms" - } + language = AdditionalArgsManager().language + if language == "en": + OVERALL_SUMMARY_ANALYZER = "overall summary" + advice_map = { + 
"Computing Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Uncovered Communication Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Free Time": "if you want more detailed advice please go to mstt_advisor_*.html" + } + time_name_map = { + "Computing Time": "computing", + "Uncovered Communication Time": "communication", + "Free Time": "free", + 'Cube Time(Num)': 'Cube Time', + 'Vector Time(Num)': 'Vector Time', + 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', + 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', + 'Other Time': "Other Computing Time", + 'SDMA Time(Num)': 'SDMA Time' + } + performance_time_dict = { + "Computing Time": "computing_time_ms", + " -- Flash Attention": "fa_time_ms", + " -- Conv": "conv_time_ms", + " -- Matmul": "matmul_time_ms", + " -- Vector": "vector_time_ms", + " -- SDMA(Tensor Move)": "tensor_move_time_ms", + " -- Other Cube": "other_cube_time_ms", + "Uncovered Communication Time": "uncovered_communication_time_ms", + " -- Wait": "wait_time_ms", + " -- Transmit": "transmit_time_ms", + "Free Time": "free_time_ms", + " -- SDMA": "sdma_time_ms", + " -- Free": "free_ms", + "E2E Time": "e2e_time_ms" + } + else: + OVERALL_SUMMARY_ANALYZER = "整网耗时分析" + advice_map = { + "计算时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", + "未被掩盖的通信时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", + "空闲时长": "如果你想了解更多详细建议请看mstt_advisor_*.html" + } + time_name_map = { + "计算时长": "computing", + "未被掩盖的通信时长": "communication", + "空闲时长": "free", + 'Cube算子时长(数量)': 'Cube Time', + 'Vector算子时长(数量)': 'Vector Time', + 'Flash Attention算子时长(前向)(数量)': 'Flash Attention Time(Forward)', + 'Flash Attention算子时长(反向)(数量)': 'Flash Attention Time(Backward)', + '其它时长': "Other Computing Time", + 'SDMA时长(数量)': 'SDMA Time' + } + performance_time_dict = { + "计算时长": "computing_time_ms", + " -- Flash Attention": "fa_time_ms", + " -- Conv": "conv_time_ms", + " -- Matmul": "matmul_time_ms", + " -- Vector": 
"vector_time_ms", + " -- SDMA(Tensor Move)": "tensor_move_time_ms", + " -- 其它Cube": "other_cube_time_ms", + "未被掩盖的通信时长": "uncovered_communication_time_ms", + " -- 等待时长": "wait_time_ms", + " -- 传输时长": "transmit_time_ms", + "空闲时长": "free_time_ms", + " -- SDMA": "sdma_time_ms", + " -- 空闲时长": "free_ms", + "E2E时长": "e2e_time_ms" + } def __init__(self, collection_path: str, n_processes: int = 1, **kwargs): profile_path = get_profile_path(collection_path) @@ -81,11 +118,19 @@ class OverallSummaryAnalyzer(BaseAnalyzer): @staticmethod def get_time_category_dict(overall_dict: dict): - time_category_dict = { - "Computing Time": round(overall_dict.get('computing_time_ms', 0.0), 3), - "Uncovered Communication Time": round(overall_dict.get('uncovered_communication_time_ms', 0.0), 3), - "Free Time": round(overall_dict.get('free_time_ms', 0.0), 3) - } + language = AdditionalArgsManager().language + if language == "en": + time_category_dict = { + "Computing Time": round(overall_dict.get('computing_time_ms', 0.0), 3), + "Uncovered Communication Time": round(overall_dict.get('uncovered_communication_time_ms', 0.0), 3), + "Free Time": round(overall_dict.get('free_time_ms', 0.0), 3) + } + else: + time_category_dict = { + "计算时长": round(overall_dict.get('computing_time_ms', 0.0), 3), + "未被掩盖的通信时长": round(overall_dict.get('uncovered_communication_time_ms', 0.0), 3), + "空闲时长": round(overall_dict.get('free_time_ms', 0.0), 3) + } return time_category_dict def path_check(self): @@ -111,14 +156,25 @@ class OverallSummaryAnalyzer(BaseAnalyzer): if not overall_data: return e2e_time = round(sum([data for data in overall_data.values()]), 3) - overall_bottleneck = f"The Model E2E Time is {e2e_time}ms.\n" + + language = AdditionalArgsManager().language + if language == "en": + overall_bottleneck = f"The Model E2E Time is {e2e_time}ms.\n" + else: + overall_bottleneck = f"模型E2E的时间是{e2e_time}ms。\n" comparison_bottleneck = "" for time_type, time_value in overall_data.items(): # add overall bottleneck - 
overall_bottleneck += f" -- {time_type} is {time_value}ms\n" + if language == "en": + overall_bottleneck += f" -- {time_type} is {time_value}ms\n" + else: + overall_bottleneck += f" -- {time_type}是{time_value}ms\n" if time_type == "Free Time" and self._is_minimal_profiling and self.calculate_ratio(time_value, e2e_time) > 0.1: - overall_bottleneck += "percentage of free time exceed the threshold 10%." + if language == "en": + overall_bottleneck += "percentage of free time exceed the threshold 10%." + else: + overall_bottleneck += "空闲时间的百分比超过了阈值的10%。" if not self._has_benchmark_profiling: continue # add comparison bottleneck @@ -127,7 +183,10 @@ class OverallSummaryAnalyzer(BaseAnalyzer): ).get(time_type) if time_value > base_duration: ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration)) - comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n" + if language == "en": + comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n" + else: + comparison_bottleneck += f"{time_type}超过了基线{ratio}。\n" self.cur_bottleneck["overall_data"] = overall_bottleneck if comparison_bottleneck: self.cur_bottleneck["comparison_result"] = comparison_bottleneck @@ -151,11 +210,18 @@ class OverallSummaryAnalyzer(BaseAnalyzer): self.bottleneck_str = result def format_over_summary_analysis(self): - headers = ['Performance Index', 'Duration(ms)', 'Duration Ratio'] + language = AdditionalArgsManager().language + if language == "en": + headers = ['Performance Index', 'Duration(ms)', 'Duration Ratio'] + else: + headers = ['性能分析', '时长(ms)', '时长占比'] performance_data = self.get_analysis_data(self._disaggregate_perf) benchmark_data = self.get_analysis_data(self._disaggregate_benchmark_perf) if self._has_benchmark_profiling: - headers.append('Diff Duration(ms)') + if language == "en": + headers.append('Diff Duration(ms)') + else: + headers.append('时长差距(ms)') self.format_analysis_with_benchmark(performance_data, benchmark_data, 
headers) else: self.format_analysis_only(performance_data, headers) diff --git a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py index 4f84d93a5..71b44dd99 100644 --- a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py +++ b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py @@ -23,6 +23,7 @@ from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.result.result import OptimizeResult from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -79,17 +80,31 @@ class OpDispatchAnalyzer(BaseAnalyzer): """ if not self._op_compile or len(self._issues_record) <= 0: return - desc = f"Found {self._op_compile.total_count} operator compile issues." - suggestion = ("Please place the following code at the entrance of the python script to disable jit compile. " \ - "Code: `torch_npu.npu.set_compile_mode(jit_compile=False); " - "torch_npu.npu.config.allow_internal_format = False`") - self.optimization_item.append(OptimizeItem("Operator dispatch", desc, [suggestion])) + language = AdditionalArgsManager().language + if language == "en": + desc = f"Found {self._op_compile.total_count} operator compile issues." + suggestion = ("Please place the following code at the entrance of the python script to disable jit compile. 
\n" \ + "Code: `torch_npu.npu.set_compile_mode(jit_compile=False) \n" + "torch_npu.npu.config.allow_internal_format = False` \n") + problem_str = "Operator dispatch" + else: + desc = f"发现{self._op_compile.total_count}个算子编译问题。" + suggestion = ( + "请在python脚本入口添加以下代码关闭在线编译:\n" \ + "'torch_npu.npu.set_compile_mode(jit_compile=False) \n" + "torch_npu.npu.config.allow_internal_format = False' \n") + problem_str = "算子下发" + self.optimization_item.append(OptimizeItem(problem_str, desc, [suggestion])) for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) - record_title = ["Issues", "op name", "counts", "total time"] - result.add_detail('operator dispatch', headers=record_title) + + if language == "en": + record_title = ["Issues", "op name", "counts", "total time"] + else: + record_title = ["问题", "算子名称", "数量", "总时长"] + result.add_detail(problem_str, headers=record_title) for op_info in self._issues_record: - result.add_detail('operator dispatch', detail=op_info) + result.add_detail(problem_str, detail=op_info) def make_render(self, html_render, **kwargs): issues = [] diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py index 098fab153..feacbb37b 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -29,6 +29,7 @@ from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.utils.utils import format_timeline_result from profiler.advisor.common.timeline.fusion_ops_db import init_timeline_ops_db from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -94,17 +95,29 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): """ if not self.matched_op_stacks: return + language = 
AdditionalArgsManager().language + if language == "en": + desc = f"Found {len(format_timeline_result(self.matched_op_stacks))} apis to be replaced" \ + f" based on the runtime env cann-{self.cann_version} and torch-{self.torch_version}" + + suggestion = "Please replace training api according to sub table 'Affinity training api'" + if self.empty_stacks: + desc += ", but with no stack" + suggestion = const.TIMELINE_EMPTY_STACKS_PROMPT.format( + timeline_profiling_doc_url=Config().timeline_with_stack_doc_url + ) + sheet_name = "Affinity apis" + else: + desc = f"目前运行环境版本为cann-{self.cann_version}和torch-{self.torch_version}," \ + f"发现有{len(format_timeline_result(self.matched_op_stacks))}个api接口可以替换。" + suggestion = "请根据子表'Affinity training api'替换训练api接口" + if self.empty_stacks: + desc += ",但没有堆栈" + suggestion = const.TIMELINE_EMPTY_STACKS_PROMPT_CN.format( + timeline_profiling_doc_url=Config().timeline_with_stack_doc_url + ) + sheet_name = "亲和API接口" - desc = f"Found {len(format_timeline_result(self.matched_op_stacks))} apis to be replaced" \ - f" based on the runtime env cann-{self.cann_version} and torch-{self.profiling_version}" - suggestion = "Please replace training api according to sub table 'Affinity training api'" - if self.empty_stacks: - desc += ", but with no stack" - suggestion = Constant.TIMELINE_EMPTY_STACKS_PROMPT.format( - timeline_profiling_doc_url=Config().timeline_with_stack_doc_url - ) - - sheet_name = "Affinity apis" optimization_item = OptimizeItem( sheet_name, desc, @@ -113,7 +126,10 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): self.result.add(OptimizeRecord(optimization_item)) - record_title = ["Affinity API", "Code stacks", "Stack called counts"] + if language == "en": + record_title = ["Affinity API", "Code stacks", "Stack called counts"] + else: + record_title = ["亲和API接口", "代码堆栈", "堆栈调用数量"] self.result.add_detail(sheet_name, headers=record_title) for api_name, stacks_info in format_timeline_result(self.matched_op_stacks).items(): diff --git 
a/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py b/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py index 72c302e1e..17c494dbd 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py @@ -21,6 +21,7 @@ from profiler.advisor.dataset.timeline_event_dataset import ComputationAnalysisD from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.utils.utils import get_analyze_processes, ParallelJob +from profiler.prof_common.additional_args_manager import AdditionalArgsManager logger = logging.getLogger() @@ -90,17 +91,33 @@ class OpStackFinder: if not self._stack_record: return - desc = f"Found {len(self._stack_record)} called stacks for" - if self.op_name and self.task_type: - desc += f" operators with name '{self.op_name}' with task type '{self.task_type}'" - elif self.op_name and not self.task_type: - desc += f" operators with name '{self.op_name}'" - elif self.task_type and not self.op_name: - desc += f" operators with task type '{self.task_type}'" + language = AdditionalArgsManager().language + if language == "en": + desc = f"Found {len(self._stack_record)} called stacks for" + if self.op_name and self.task_type: + desc += f" operators with name '{self.op_name}' with task type '{self.task_type}'" + elif self.op_name and not self.task_type: + desc += f" operators with name '{self.op_name}'" + elif self.task_type and not self.op_name: + desc += f" operators with task type '{self.task_type}'" + else: + desc += " all operators" + + suggestion = f"Please use command 'ma-advisor analyze profiling' to analyze operators" else: - desc += " all operators" + desc = f"发现以下{len(self._stack_record)}个算子的调用堆栈," + if self.op_name and self.task_type: + desc += f"任务类型为'{self.task_type}'的'{self.op_name}'算子" + elif self.op_name 
and not self.task_type: + desc += f"'{self.op_name}'算子" + elif self.task_type and not self.op_name: + desc += f"算子类型为'{self.task_type}'" + else: + desc += "包括全部算子" + + suggestion = f"请用命令'ma-advisor analyze profiling'分析算子" + - suggestion = f"Please use command 'ma-advisor analyze profiling' to analyze operators" optimization_item = OptimizeItem( "Operator stacks", desc, diff --git a/profiler/advisor/analyzer/schedule/gc/gc_checker.py b/profiler/advisor/analyzer/schedule/gc/gc_checker.py index 321b80fc6..453f9eb5d 100644 --- a/profiler/advisor/analyzer/schedule/gc/gc_checker.py +++ b/profiler/advisor/analyzer/schedule/gc/gc_checker.py @@ -20,6 +20,7 @@ from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisData from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.utils.utils import convert_to_float, convert_to_int, safe_division +from profiler.prof_common.additional_args_manager import AdditionalArgsManager from profiler.prof_common.constant import Constant from profiler.prof_common.file_manager import FileManager @@ -84,12 +85,25 @@ class GcChecker: if not self.gc_issues: return - self.optimization_item.append(OptimizeItem("GC", self.desc, self.suggestions)) + language = AdditionalArgsManager().language + if language == "en": + problem_str = "GC Analysis" + else: + problem_str = "GC分析" + + self.optimization_item.append(OptimizeItem(problem_str, self.desc, self.suggestions)) for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) - if self.rank is not None: - self.headers = ["Rank id"] + self.headers - sub_table_name = "GcAnalysis" if not self.stage else f"Stage-{self.stage}: GcAnalysis" + + if language == "en": + if self.rank is not None: + self.headers = ["Rank id"] + self.headers + sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" + else: + if self.rank is not None: + self.headers 
= ["卡号"] + self.headers + sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" + result.add_detail(sub_table_name, headers=self.headers) for row in self.abnormal_gc_list: @@ -164,9 +178,11 @@ class GcChecker: return {} def _init_rule(self): + language = AdditionalArgsManager().language gc_rule_path = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), "rules", + language, "gc.yaml" ) diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py index 878d5869e..28661aa09 100644 --- a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py @@ -1,89 +1,92 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import logging -import os - -from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset -from profiler.advisor.result.result import OptimizeResult -from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.prof_common.file_manager import FileManager - -logger = logging.getLogger() - - -class SyncBNChecker: - - def __init__(self): - self.optimization_item = [] - self.syncbn_issues = False - self.desc = "" - self.suggestions = [] - self.solutions = None - self.max_syncbn_num = None - self._init_rule() - - def check_syncbn(self, event_dataset: ScheduleAnalysisDataset): - """ - :Param event_dataset: dataset of timeline event - """ - if not hasattr(event_dataset, "sync_batchnorm") or not getattr(event_dataset, "sync_batchnorm"): - logger.debug("Skip syncbn checker, because no syncbn found") - return - - syncbn_num = len(event_dataset.sync_batchnorm) - self.syncbn_issues = syncbn_num >= self.max_syncbn_num - self.desc = self.desc.format(syncbn_num=syncbn_num) - - def make_record(self, result: OptimizeResult): - """ - make record for what and how to optimize - """ - if not self.syncbn_issues: - return - - self.optimization_item.append(OptimizeItem("SyncBatchNorm", self.desc, self.suggestions)) - for optimization in self.optimization_item: - result.add(OptimizeRecord(optimization)) - - def make_render(self, html_render, **kwargs): - if not self.syncbn_issues: - return - - priority = kwargs.get("priority") - rank = kwargs.get("rank") - html_render.render_template(key="schedule", - template_dir="templates", - template_name="sync_batchnorm.html", - desc=self.desc, - solutions=self.solutions, - priority_background_color=priority, - rank=rank) - - def _init_rule(self): - syncbn_rule_path = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), - "rules", - "sync_batchnorm.yaml" - ) - - syncbn_rule = FileManager.read_yaml_file(syncbn_rule_path) - - self.max_syncbn_num = 
syncbn_rule.get("max_syncbn_num") - self.desc = syncbn_rule.get("problem") - - self.solutions = syncbn_rule.get("solutions") - for solution in self.solutions: - for key, val in solution.items(): - self.suggestions.append(f"{key}, {val.get('desc')}") +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import os + +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.prof_common.additional_args_manager import AdditionalArgsManager +from profiler.prof_common.file_manager import FileManager + +logger = logging.getLogger() + + +class SyncBNChecker: + + def __init__(self): + self.optimization_item = [] + self.syncbn_issues = False + self.desc = "" + self.suggestions = [] + self.solutions = None + self.max_syncbn_num = None + self._init_rule() + + def check_syncbn(self, event_dataset: ScheduleAnalysisDataset): + """ + :Param event_dataset: dataset of timeline event + """ + if not hasattr(event_dataset, "sync_batchnorm") or not getattr(event_dataset, "sync_batchnorm"): + logger.debug("Skip syncbn checker, because no syncbn found") + return + + syncbn_num = len(event_dataset.sync_batchnorm) + self.syncbn_issues = syncbn_num >= self.max_syncbn_num + self.desc = self.desc.format(syncbn_num=syncbn_num) + + def 
make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self.syncbn_issues: + return + + self.optimization_item.append(OptimizeItem("SyncBatchNorm", self.desc, self.suggestions)) + for optimization in self.optimization_item: + result.add(OptimizeRecord(optimization)) + + def make_render(self, html_render, **kwargs): + if not self.syncbn_issues: + return + + priority = kwargs.get("priority") + rank = kwargs.get("rank") + html_render.render_template(key="schedule", + template_dir="templates", + template_name="sync_batchnorm.html", + desc=self.desc, + solutions=self.solutions, + priority_background_color=priority, + rank=rank) + + def _init_rule(self): + language = AdditionalArgsManager().language + syncbn_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), + "rules", + language, + "sync_batchnorm.yaml" + ) + + syncbn_rule = FileManager.read_yaml_file(syncbn_rule_path) + + self.max_syncbn_num = syncbn_rule.get("max_syncbn_num") + self.desc = syncbn_rule.get("problem") + + self.solutions = syncbn_rule.get("solutions") + for solution in self.solutions: + for key, val in solution.items(): + self.suggestions.append(f"{key}, {val.get('desc')}") diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py index 62dc005ec..745882cbd 100644 --- a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py @@ -1,129 +1,132 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
# Copyright (c) 2024, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os

from profiler.advisor.analyzer.schedule.timeline_base_checker import TimelineBaseChecker
from profiler.prof_common.additional_args_manager import AdditionalArgsManager
from profiler.prof_common.constant import Constant
from profiler.advisor.config.config import Config
from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset
from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor
from profiler.advisor.result.result import OptimizeResult
from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
from profiler.advisor.utils.utils import format_timeline_result, safe_division
from profiler.prof_common.file_manager import FileManager

logger = logging.getLogger()


class SynchronizeStreamChecker(TimelineBaseChecker):
    """Detect excessive stream synchronization in the timeline.

    When most NodeLaunch events are immediately followed by an
    aclrtSynchronizeStream event, the ASCEND_LAUNCH_BLOCKING environment
    variable was most likely set, serializing kernel launches. The ratio
    threshold, problem text and solutions come from the language-specific
    ``synchronize.yaml`` rule file.
    """

    def __init__(self):
        super().__init__(n_processes=1)
        self.optimization_item = []
        self.synchronize_issues = False
        self.desc = ""
        self.suggestions = []
        self.solutions = []
        self.min_co_occurrence_ratio = 0
        self.priority = None
        self._init_rule()

    def check_synchronize(self, event_dataset: ScheduleAnalysisDataset):
        """Count NodeLaunch/SynchronizeStream pairs and flag a high ratio.

        :Param event_dataset: dataset of timeline event
        """
        if not getattr(event_dataset, "synchronize_stream", None):
            logger.info("Skip synchronize stream checker, because no synchronize stream found")
            return

        node_launch_num = 0
        co_occurrence_num = 0
        synchronize_num = 0
        synchronize_stream = event_dataset.synchronize_stream
        for index, op in enumerate(synchronize_stream):
            if op.name.startswith(Constant.NODE_LAUNCH):
                node_launch_num += 1
            if op.name.startswith(Constant.SYNC_STREAM):
                synchronize_num += 1

                # Count how often a NodeLaunch is immediately followed by a
                # SynchronizeStream event.
                if index > 0 and synchronize_stream[index - 1].name.startswith(Constant.NODE_LAUNCH):
                    co_occurrence_num += 1

        # A high co-occurrence ratio strongly suggests the
        # ASCEND_LAUNCH_BLOCKING environment variable was set.
        co_occurrence_ratio = round(safe_division(co_occurrence_num, node_launch_num), 4)
        if co_occurrence_ratio > self.min_co_occurrence_ratio:
            self.synchronize_issues = True

        self.priority = self.get_priority()

        self.desc = self.desc.format(synchronize_num=synchronize_num,
                                     node_launch_num=node_launch_num,
                                     co_occur_ratio=co_occurrence_ratio)

        # NOTE(review): a leftover loop over a freshly created empty list
        # ("solutions = []; for solution in solutions: ...") used to live
        # here; it could never execute and was removed. Solutions and
        # suggestions are populated once in _init_rule().

    def make_record(self, result: OptimizeResult):
        """
        make record for what and how to optimize
        """
        if not self.synchronize_issues:
            return

        self.optimization_item.append(OptimizeItem("SynchronizeStream", self.desc, self.suggestions))
        for optimization in self.optimization_item:
            result.add(OptimizeRecord(optimization))

    def make_render(self, html_render, **kwargs):
        """Render the finding plus matched op stacks into the HTML report."""
        if not self.synchronize_issues:
            return
        priority = kwargs.get("priority")
        rank = kwargs.get("rank")
        format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True)
        html_render.render_template(key="schedule",
                                    template_dir="templates",
                                    template_name="synchronize_stream.html",
                                    desc=self.desc,
                                    solutions=self.solutions,
                                    result=format_result_for_html,
                                    with_stack_doc_url=Config().timeline_with_stack_doc_url,
                                    empty_stacks=self.empty_stacks,
                                    framework_black_list=self.framework_black_list,
                                    priority_background_color=priority,
                                    rank=rank)

    def get_priority(self):
        """Stream synchronization issues are always reported as high priority."""
        return PriorityBackgroundColor.high

    def _init_rule(self):
        """Load ratio threshold, problem text and solutions from the yaml rule file."""
        synchronize_rule_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))),
            "rules",
            AdditionalArgsManager().language,
            "synchronize.yaml"
        )

        synchronize_rule = FileManager.read_yaml_file(synchronize_rule_path)

        self.min_co_occurrence_ratio = synchronize_rule.get("min_co_occurrence_ratio")
        self.desc = synchronize_rule.get("problem")

        self.solutions = synchronize_rule.get("solutions")
        # Each solution maps a short title to a dict holding a "desc" entry.
        for solution in self.solutions:
            for key, val in solution.items():
                self.suggestions.append(f"{key}, {val.get('desc')}")
from profiler.prof_common.additional_args_manager import AdditionalArgsManager


class SupportedScopes:
    # Used to specify fourth-level commands and define the keys of the result dict.
    # Each key defined below must be the same as its value.
    #
    # NOTE(review): the values double as display names and are localized once,
    # at class-creation time, from the globally configured language; for the
    # Chinese branch the attribute names no longer equal their values, so the
    # comment above only holds for the English branch.

    language = AdditionalArgsManager().language
    if language == "en":
        TIMELINE_FUSION_OPS = "timeline_fusion_ops"
        GRAPH = "graph"
        SLOW_RANK = "slow_rank"
        SLOW_LINK = "slow_link"
        COMMUNICATION_RETRANSMISSION_DETECTION = "communication_retransmission_analysis"
        PACKET = "packet_analysis"
        BANDWIDTH_CONTENTION_DETECTION = "bandwidth_contention_analysis"
        BYTE_ALIGNMENT_DETECTION = "byte_alignment_analysis"
        OVER_ALL = "over_all"
        ENVIRONMENT_VARIABLE_ANALYSIS = "environment_variable_analysis"
        DYNAMIC_SHAPE_ANALYSIS = "dynamic_shape_analysis"
        AICPU_ANALYSIS = "aicpu_analysis"
        BLOCK_DIM_ANALYSIS = "block_dim_analysis"
        OPERATOR_NO_BOUND_ANALYSIS = "operator_no_bound_analysis"
        TIMELINE_OP_DISPATCH = "timeline_op_dispatch"
        DATALOADER = "dataloader"
        SYNCBN = "syncbn"
        SYNCHRONIZE_STREAM = "synchronize_stream"
        FREQ_ANALYSIS = "freq_analysis"
        MEMORY = "memory"
        STAGE_COMPUTE = "stage_compute"
        GC_ANALYSIS = "gc_analysis"
        COMPARISON = "comparison"
    else:
        # Chinese display names; attribute names stay identical so callers are
        # unaffected by the localization.
        TIMELINE_FUSION_OPS = "融合算子"
        GRAPH = "图"
        SLOW_RANK = "慢节点"
        SLOW_LINK = "慢链路"
        COMMUNICATION_RETRANSMISSION_DETECTION = "通信重传分析"
        PACKET = "包分析"
        BANDWIDTH_CONTENTION_DETECTION = "带宽限制分析"
        BYTE_ALIGNMENT_DETECTION = "字节对齐分析"
        OVER_ALL = "总览"
        ENVIRONMENT_VARIABLE_ANALYSIS = "环境变量分析"
        DYNAMIC_SHAPE_ANALYSIS = "动态shape分析"
        AICPU_ANALYSIS = "aicpu分析"
        BLOCK_DIM_ANALYSIS = "AICore核数分析"
        OPERATOR_NO_BOUND_ANALYSIS = "算子瓶颈分析"
        TIMELINE_OP_DISPATCH = "调度"
        DATALOADER = "数据加载"
        SYNCBN = "batchnorm同步"
        SYNCHRONIZE_STREAM = "流同步"
        FREQ_ANALYSIS = "频率分析"
        MEMORY = "内存"
        STAGE_COMPUTE = "阶段计算"
        GC_ANALYSIS = "gc分析"
        COMPARISON = "对比"
# Copyright (c) 2023, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared constants for the profiler advisor (timeline markers, rule
locations, cluster-analysis file names, I/O limits and unit conversions)."""

import os
import stat

# timeline: event names / separators used when parsing the trace
DEQUEUE = "Dequeue"
DEQUEUE_SEP = "@"
ATEN = "aten"
NPU = "npu"
ATEN_SEP = "::"
OPTIMIZER = "Optimizer"
OPTIMIZER_SEP = "#"
OPTIMIZER_STEP = "step"
ENQUEUE = "enqueue"
TORCH_TO_NPU = "torch_to_npu"
FREE = "free"
OP_COMPILE_NAME = "AscendCL@aclopCompileAndExecute"
OP_COMPILE_ID = "aclopCompileAndExecute"
SYNC_STREAM = "AscendCL@aclrtSynchronizeStream"
NODE_LAUNCH = "Node@launch"
MAX_OP_COMPILE_NUM = 20
ACL_TO_NPU = "acl_to_npu"
TASK_TYPE = "Task Type"
CPU_OP = "cpu_op"
AI_CORE = "AI_CORE"
AI_CPU = "AI_CPU"
MIX_AIC = "MIX_AIC"
CALL_STACKS = "Call stack"
INPUT_DIMS = "Input Dims"
OP_SEP = "-"
ADVISOR_MAX_PROCESSES = 8
ADVISOR_ANALYZE_PROCESSES = "ADVISOR_ANALYZE_PROCESSES"
TIMELINE_OP_STACKS_DATASET = "timeline_op_stacks_dataset"
TIMELINE_BACKWARD_NO_STACK = "Backward broadcast, without call stacks in profiling."
TIMELINE_ACL_TO_NPU_NO_STACK = "Incoming flow is 'acl_to_npu', without call stacks in profiling."
# Sentinel codes used in place of a real stack id when no stack was captured.
TIMELINE_BACKWARD_NO_STACK_CODE = -1
TIMELINE_ACL_TO_NPU_NO_STACK_CODE = -2
TIMELINE_FUSION_OPS_NO_STACK_FLAG = "NO STACK"
NO_STACK_REASON_MAP = {
    TIMELINE_BACKWARD_NO_STACK_CODE: "Backward broadcast, without call stacks in profiling.",
    TIMELINE_ACL_TO_NPU_NO_STACK_CODE: "Incoming flow is 'acl_to_npu', without call stacks in profiling."
}
AFFINITY_TRAINING_API = "Affinity training api"
# Both prompts carry a {timeline_profiling_doc_url} placeholder filled at render time.
TIMELINE_EMPTY_STACKS_PROMPT = "These APIs have no code stack. If parameter 'with_stack=False' while profiling, " \
                               "please refer to {timeline_profiling_doc_url} to set 'with_stack=True'. " \
                               "Otherwise, ignore following affinity APIs due to backward broadcast lack of stack."
TIMELINE_EMPTY_STACKS_PROMPT_CN = "这些API接口没有代码堆栈。如果采集profiling时参数为'with_stack=False'," \
                                  "请参考{timeline_profiling_doc_url}设置'with_stack=True'。" \
                                  "另外,由于反向传播没有堆栈,请忽略以下亲和APIs。"
CLUSTER_ANALYSIS = "Cluster analysis"
SLOW_RANK_TIME_RATIO_THRESHOLD = 0.05

# keys of user-supplied analysis arguments
CANN_VERSION = "cann_version"
TORCH_VERSION = "torch_version"
PROFILING_TYPE = "profiling_type"
ANALYSIS_DIMENSIONS = "analysis_dimensions"

PROFILER_METADATA = "profiler_metadata.json"

TERMINAL_OUTPUT_HEADERS = ["No.", "Problem", "Description", "Suggestion"]
SKIP_ANALYZE_PROMPT = "Finish analysis, no optimization suggestions"
SKIP_QUERY_PROMPT = "Finish query operator stack, no operators"

# operator output constant
OPERATOR_OUT_TOPK = 10
OPERATOR_LIST_UNLIMIT = -1

DEFAULT_OPERATOR_TYPE = 'None_type'
DEFAULT_DURATION_ZERO = 0.0

# logging configuration (env var name, default, accepted values)
ADVISOR_LOG_LEVEL = "ADVISOR_LOG_LEVEL"
DEFAULT_LOG_LEVEL = "INFO"
SUPPORTED_LOG_LEVEL = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]

# cloud rule download: OBS regions and endpoints
RULE_BUCKET = "RULE-BUCKET"
CLOUD_RULE_REGION_CN_NORTH_9 = "cn-north-9"
CLOUD_RULE_REGION_CN_NORTH_7 = "cn-north-7"
CLOUD_RULE_REGION_CN_SOUTHWEST_2 = "cn-southwest-2"
CLOUD_RULE_REGION_LIST = [CLOUD_RULE_REGION_CN_NORTH_7, CLOUD_RULE_REGION_CN_NORTH_9, CLOUD_RULE_REGION_CN_SOUTHWEST_2]
INNER_REGION_LIST = [CLOUD_RULE_REGION_CN_NORTH_7]
DEFAULT_CLOUD_RULE_REGION = CLOUD_RULE_REGION_CN_SOUTHWEST_2

HTTP_PREFIXES = "http://"
HTTPS_PREFIXES = "https://"
COMMON_YAML_DIR = "modelarts/solution/ma_advisor_rules/"
COMMON_ENDPOINT_SUFFIX = "obs.{}.myhuaweicloud.com"
INNER_ENDPOINT_SUFFIX = "obs.{}.ulanqab.huawei.com"

# rule files that may be refreshed from the cloud bucket
AICPU_RULES_YAML_NAME = "aicpu_rules.yaml"
FUSION_PASS_YAML_NAME = "op_fusion_pass.yaml"
TIMELINE_FUSION_OPS_YAML_NAME = "timeline_fusion_ops.yaml"
CLOUD_YAML_NAME_LIST = [AICPU_RULES_YAML_NAME, FUSION_PASS_YAML_NAME, TIMELINE_FUSION_OPS_YAML_NAME]

MAX_RETRIES = 3
TIMEOUT = 3
DEPTH_LIMIT = 20

ADVISOR_RULE_PATH = "ADVISOR_RULE_PATH"
CLOUD_RULE_PATH = "rules/cloud/"
DEFAULT_RULE_PATH = "./rules/"

TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID = -1

DEFAULT_TEMPLATE_HEADER = "Performance Optimization Suggestions"

# profiling collection layout: directory suffixes and result file names
PT_PROF_SUFFIX = "ascend_pt"
ASCEND_PROFILER_OUTPUT = "ASCEND_PROFILER_OUTPUT"
COLLECTION_PATH = "collection_path"
CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output"
KERNEL_DETAILS_CSV = "kernel_details.csv"
CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv"
CLUSTER_COMM_JSON = "cluster_communication.json"
COMMUNICATION_JSON = "communication.json"

BOTTLENECK = "bottleneck"
DATA = "data"
ADVISOR_ANALYSIS_OUTPUT_DIR = "advisor_analysis_result"
DEFAULT_PROCESSES = 8
CLUSTER_ANALYSIS_FILE_PATTERN = [
    r'profiler_info_\d+\.json', "step_trace_time.csv", "communication.json", "communication_matrix.json"
]
ANALYSIS_OUTPUT_PATH = "ANALYSIS_OUTPUT_PATH"
DEFAULT_RANK_FOR_PROFILING_ANALYSIS = 0
PROFILER_INFO_FILE_PATTERN = r"profiler_info_(\d+)\.json"
# NOTE(review): the misspelled "STREAMINIG" key below coexists with the
# correctly spelled DISABLE_STREAMING_READER a few lines down — presumably
# kept for backward compatibility with an older env var name; verify which
# one(s) the readers actually consult before removing either.
DISABLE_STREAMINIG_READER = "DISABLE_STREAMINIG_READER"
FRAMEWORK_STACK_BLACK_LIST = ["torch", "torch_npu", "megatron", "deepspeed"]
DISABLE_STREAMING_READER = "DISABLE_STREAMING_READER"
MAX_FILE_SIZE = 10 ** 10
MAX_NUM_PROCESSES = 4
DEFAULT_STEP = "-1"
STEP_RANK_SEP = "_"

# NOTE(review): 8196 looks like a typo for 8192 (an 8 MB line limit) — confirm
# before changing, as existing behavior depends on the current value.
MAX_READ_LINE_BYTES = 8196 * 1024
MAX_READ_FILE_BYTES = 64 * 1024 * 1024 * 1024
MAX_READ_DB_FILE_BYTES = 8 * 1024 * 1024 * 1024

# Unit Conversion
COMMUNICATION_B_TO_GB = 0.001 ** 3
US_TO_S = 0.001 ** 2

# file creation flags: owner read/write, group read; truncate on open
WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP
WRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC

# feature-toggle environment variable names
DISABLE_PROFILING_COMPARISON = "DISABLE_PROFILING_COMPARISON"
FREE_DURATION_FOR_GC_ANALYSIS = "FREE_DURATION_FOR_GC_ANALYSIS"
DISABLE_AFFINITY_API = "DISABLE_AFFINITY_API"

# communication.json
TOTAL_OP_INFO = "Total Op Info"
index 52f15caef..feae1254d 100644 --- a/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py +++ b/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py @@ -1,398 +1,403 @@ -import logging -import math -import os -from abc import abstractmethod, ABCMeta - -from profiler.prof_common.constant import Constant -from profiler.advisor.common.timeline.event import TimelineEvent -from profiler.advisor.utils.utils import convert_to_float -from profiler.prof_common.file_manager import FileManager - -logger = logging.getLogger() - - -class BaseOpCollector(metaclass=ABCMeta): - - def __init__(self): - self.attribute_to_dataset = {} - self.op_list = [] - self.require_filter_by_step = True - - @abstractmethod - def add_op(self): - """ add timeline event into self.op_list, and then will filter event in self.op_list by specific step - """ - pass - - @abstractmethod - def post_process(self): - """ convert self.op_list to required format like dict, set and so on and then record the final object into - self.attribute_to_dataset which used to set property of timeline event dataset - """ - pass - - -class StepCollector(BaseOpCollector): - KEY_WORD = "ProfilerStep" - - def __init__(self): - super().__init__() - self.require_filter_by_step = False - - def add_op(self, event): - if event.name.startswith(self.KEY_WORD): - self.op_list.append(event) - - def post_process(self, *args, **kwargs): - self.attribute_to_dataset["profiler_step"] = self.op_list - - -class OpCompileCollector(BaseOpCollector): - def __init__(self): - super().__init__() - self._total_op_compile_counter = 0 - self._total_op_compile_time = 0.0 - - @property - def total_time(self): - return self._total_op_compile_time - - @property - def total_count(self): - return self._total_op_compile_counter - - def is_empty(self): - return self._total_op_compile_counter == 0 - - def update(self, event: TimelineEvent): - self._total_op_compile_time += float(event.dur) - 
self._total_op_compile_counter += 1 - - def unset(self): - self._total_op_compile_counter = 0 - self._total_op_compile_time = 0.0 - - def add_op(self, event): - if event.name == Constant.OP_COMPILE_NAME or event.args.get("id") == Constant.OP_COMPILE_ID: - self.op_list.append(event) - - def post_process(self, target_op_list, **kwargs): - for op in target_op_list: - self.update(op) - - self.attribute_to_dataset["ops_compile"] = self - - -class SynchronizeStreamCollector(BaseOpCollector): - - def __init__(self): - super().__init__() - self.require_filter_by_step = False - - def add_op(self, event): - if event.name.startswith(Constant.SYNC_STREAM) or event.name.startswith(Constant.NODE_LAUNCH): - self.op_list.append(event) - - def post_process(self, *args, **kwargs): - self.op_list.sort(key=lambda x: x.ts) - - self.attribute_to_dataset["synchronize_stream"] = self.op_list - - -class MemCollector(BaseOpCollector): - MEMORY_OP_NAME = ["AscendCL@aclMallocMemInner", "AscendCL@aclrtFreePhysical", "AscendCL@aclrtFree"] - - def __init__(self): - super().__init__() - self.mem_op_info = {} - self.rule = self._load_rule() - - @staticmethod - def _load_rule(): - memory_rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), - "rules", - "memory.yaml") - - memory_rule = FileManager.read_yaml_file(memory_rule_path) - return memory_rule - - def add_op(self, event): - if event.name not in self.MEMORY_OP_NAME: - return - self.op_list.append(event) - - def post_process(self, target_op_list, **kwargs): - for op in target_op_list: - if op.name not in self.mem_op_info: - self.mem_op_info[op.name] = dict(count=0, total_dur=0) - self.mem_op_info[op.name]["count"] += 1 - self.mem_op_info[op.name]["total_dur"] += float(op.dur) - - self.attribute_to_dataset["memory_ops"] = self - - -class DataloaderCollector(BaseOpCollector): - key_word = "dataloader" - - def __init__(self): - super().__init__() - - def add_op(self, event): - if self.key_word in 
event.name.lower(): - self.op_list.append(TimelineEvent({ - "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur, - "stack": event.args.get("Call stack") - })) - - def post_process(self, *args, **kwargs): - self.attribute_to_dataset["dataloader"] = self.op_list - - -class SyncBNCollector(BaseOpCollector): - key_word = "syncbatchnorm" - - def __init__(self): - super().__init__() - - def add_op(self, event): - if event.name.lower() == self.key_word: - self.op_list.append(TimelineEvent({ - "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur - })) - - def post_process(self, target_op_list, **kwargs): - self.attribute_to_dataset["sync_batchnorm"] = target_op_list - - -class AtenCollector(BaseOpCollector): - - def __init__(self): - super().__init__() - - def add_op(self, event): - if event.name.lower().startswith(f"{Constant.ATEN}{Constant.ATEN_SEP}") or event.name.lower().startswith( - f"{Constant.NPU_LOWER}{Constant.ATEN_SEP}"): - self._add_aten(event) - return - - # 检查cann层同步操作,根据时间窗口索引到host侧的aten算子并给出堆栈 - if event.name.startswith(Constant.SYNC_STREAM): - self._add_aten(event) - - def post_process(self, target_op_list, **kwargs): - self.attribute_to_dataset["aten"] = target_op_list - - def _add_aten(self, event: TimelineEvent): - self.op_list.append(TimelineEvent({ - "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur - })) - - -class OptimizerCollector(BaseOpCollector): - - def __init__(self): - super().__init__() - - def add_op(self, event): - if event.name.startswith(f"{Constant.OPTIMIZER}.{Constant.OPTIMIZER_STEP}{Constant.OPTIMIZER_SEP}"): - self.op_list.append(TimelineEvent( - {"name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur})) - - def post_process(self, target_op_list, **kwargs): - self.attribute_to_dataset["optimizer"] = target_op_list - - -class FrequencyCollector(BaseOpCollector): - KEY_WORD = "AI Core 
Freq" - - def __init__(self): - super().__init__() - self._previous_freq_index = -1 - - @staticmethod - def get_op_frequency(ai_core_ops, ai_core_freq): - ai_core_freq.sort(key=lambda x: float(x.ts)) - op_freq_record = {} - - op_index, freq_index = 0, 0 - while op_index < len(ai_core_ops) and freq_index < len(ai_core_freq): - op_event = ai_core_ops[op_index] - op_end_time = convert_to_float(op_event.ts) + convert_to_float(op_event.dur) - op_freq_list = [] - while freq_index < len(ai_core_freq): - freq_event = ai_core_freq[freq_index] - if convert_to_float(freq_event.end) < op_end_time: - op_freq_list.append(convert_to_float(freq_event.args.MHz)) - freq_index += 1 - continue - elif convert_to_float(freq_event.ts) < op_end_time: - if op_event.name not in op_freq_record: - op_freq_record[op_event.name] = {"count": 0, "dur": 0, "freq_list": []} - op_freq_record[op_event.name]["count"] += 1 - op_freq_record[op_event.name]["dur"] += convert_to_float(op_event.dur) - op_freq_list.append(convert_to_float(freq_event.args.MHz)) - op_freq_record[op_event.name]["freq_list"].append(min(op_freq_list)) - break - else: - break - - op_index += 1 - return op_freq_record - - def add_op(self, event): - if event.name == self.KEY_WORD: - if self._previous_freq_index != -1: - self.op_list[self._previous_freq_index]["end"] = event.get("ts", float(math.inf)) - self._previous_freq_index += 1 - event.setdefault("end", float(math.inf)) - self.op_list.append(event) - - def post_process(self, target_op_list, **kwargs): - ai_core_ops = kwargs.get("ai_core_ops", []) - if not ai_core_ops: - return - ai_core_ops.sort(key=lambda x: float(x.ts)) - op_freq = FrequencyCollector.get_op_frequency(ai_core_ops, target_op_list) - self.attribute_to_dataset["op_freq"] = op_freq - - -class SpecificTaskTypeOpCollector(BaseOpCollector): - - def __init__(self, op_type_list=None): - super().__init__() - self.op_type_list = op_type_list if op_type_list else [Constant.AI_CPU, Constant.AI_CORE, Constant.MIX_AIC] - - 
def add_op(self, event): - if event.args.get(Constant.TASK_TYPE) and event.args.get(Constant.TASK_TYPE) in self.op_type_list: - self.op_list.append( - TimelineEvent( - { - Constant.TASK_TYPE: event.args.get(Constant.TASK_TYPE), - "task_id": event.args.get("Task Id"), - "tid": event.tid, - "name": event.name, - "ts": str(event.ts), - "dur": str(event.dur) - } - ) - ) - - def post_process(self, target_op_list, **kwargs): - op_map = dict() - for op in target_op_list: - key = f"{op.name}-{op.ts}" - op_map[key] = op - - self.attribute_to_dataset["ops_with_task_type"] = op_map - self.attribute_to_dataset["task_op_names"] = list( - set([event_key.split("-")[0] for event_key in op_map.keys()])) - - -class TorchToNpuCollector(BaseOpCollector): - def __init__(self): - super().__init__() - - def add_op(self, event): - if event.name.lower() == Constant.TORCH_TO_NPU: - self.op_list.append(TimelineEvent({"tid": event.tid, "ts": str(event.ts), "ph": event.ph, "id": event.id})) - - def post_process(self, target_op_list, **kwargs): - op_map = dict() - for op in target_op_list: - key = f"{op.ph}-{op.id}" - op_map[key] = op - - self.attribute_to_dataset["torch_to_npu"] = op_map - - -class AclToNpuCollector(BaseOpCollector): - def __init__(self): - super().__init__() - - def add_op(self, event): - if event.name and event.ts and event.name == Constant.ACL_TO_NPU: - self.op_list.append(TimelineEvent({"ts": event.ts})) - - def post_process(self, target_op_list, **kwargs): - op_record = set(str(op.ts) for op in target_op_list) - self.attribute_to_dataset["acl_to_npu"] = op_record - - -class OpStackCollector(BaseOpCollector): - def __init__(self): - super().__init__() - - def add_op(self, event): - if event.args.get(Constant.CALL_STACKS): - self.op_list.append( - TimelineEvent({"name": event.name, "dataset_index": event.dataset_index, "ts": event.ts})) - - def post_process(self, target_op_list, **kwargs): - op_map = dict() - for op in target_op_list: - op_map[str(op.ts)] = op - - 
self.attribute_to_dataset["ops_with_stack"] = op_map - - -class GcCollector(BaseOpCollector): - def __init__(self): - super().__init__() - - def add_op(self, event): - if event.cat and isinstance(event.cat, str) and event.cat.lower() == "gc": - self.op_list.append(TimelineEvent( - {"name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur})) - - def post_process(self, target_op_list, **kwargs): - self.attribute_to_dataset["gc_events"] = self.op_list - - -class FreeEventsCollector(BaseOpCollector): - def __init__(self): - super().__init__() - - @staticmethod - def _load_rule(): - sync_stream_rule_path = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), - "rules", - "gc.yaml") - - gc_rule = FileManager.read_yaml_file(sync_stream_rule_path) - return gc_rule - - def add_op(self, event): - if event.name.lower() == Constant.FREE: - self.op_list.append(event) - - def post_process(self, target_op_list, **kwargs): - gc_rule = self._load_rule() - if os.getenv(Constant.FREE_DURATION_FOR_GC_ANALYSIS): - max_free_threshold = convert_to_float(os.getenv(Constant.FREE_DURATION_FOR_GC_ANALYSIS)) - else: - max_free_threshold = gc_rule.get("max_free_threshold") - - large_free_events = [] - - for op in target_op_list: - if convert_to_float(op.dur) > max_free_threshold: - large_free_events.append(op) - - large_free_events.sort(key=lambda x: convert_to_float(x.ts)) - self.attribute_to_dataset["large_free_events"] = large_free_events - - -class AclEventsCollector(BaseOpCollector): - ACL_EVENT_PREFIX = "AscendCL@" - - def __init__(self): - super().__init__() - - def add_op(self, event): - if event.name.startswith(self.ACL_EVENT_PREFIX): - self.op_list.append(event) - - def post_process(self, target_op_list, **kwargs): - target_op_list.sort(key=lambda x: convert_to_float(x.ts)) - self.attribute_to_dataset["acl_events"] = target_op_list +import logging +import math +import os +from abc import abstractmethod, 
ABCMeta + +from profiler.prof_common.additional_args_manager import AdditionalArgsManager +from profiler.prof_common.constant import Constant +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.utils.utils import convert_to_float +from profiler.prof_common.file_manager import FileManager + +logger = logging.getLogger() + + +class BaseOpCollector(metaclass=ABCMeta): + + def __init__(self): + self.attribute_to_dataset = {} + self.op_list = [] + self.require_filter_by_step = True + + @abstractmethod + def add_op(self): + """ add timeline event into self.op_list, and then will filter event in self.op_list by specific step + """ + pass + + @abstractmethod + def post_process(self): + """ convert self.op_list to required format like dict, set and so on and then record the final object into + self.attribute_to_dataset which used to set property of timeline event dataset + """ + pass + + +class StepCollector(BaseOpCollector): + KEY_WORD = "ProfilerStep" + + def __init__(self): + super().__init__() + self.require_filter_by_step = False + + def add_op(self, event): + if event.name.startswith(self.KEY_WORD): + self.op_list.append(event) + + def post_process(self, *args, **kwargs): + self.attribute_to_dataset["profiler_step"] = self.op_list + + +class OpCompileCollector(BaseOpCollector): + def __init__(self): + super().__init__() + self._total_op_compile_counter = 0 + self._total_op_compile_time = 0.0 + + @property + def total_time(self): + return self._total_op_compile_time + + @property + def total_count(self): + return self._total_op_compile_counter + + def is_empty(self): + return self._total_op_compile_counter == 0 + + def update(self, event: TimelineEvent): + self._total_op_compile_time += float(event.dur) + self._total_op_compile_counter += 1 + + def unset(self): + self._total_op_compile_counter = 0 + self._total_op_compile_time = 0.0 + + def add_op(self, event): + if event.name == Constant.OP_COMPILE_NAME or event.args.get("id") == 
Constant.OP_COMPILE_ID: + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + for op in target_op_list: + self.update(op) + + self.attribute_to_dataset["ops_compile"] = self + + +class SynchronizeStreamCollector(BaseOpCollector): + + def __init__(self): + super().__init__() + self.require_filter_by_step = False + + def add_op(self, event): + if event.name.startswith(Constant.SYNC_STREAM) or event.name.startswith(Constant.NODE_LAUNCH): + self.op_list.append(event) + + def post_process(self, *args, **kwargs): + self.op_list.sort(key=lambda x: x.ts) + + self.attribute_to_dataset["synchronize_stream"] = self.op_list + + +class MemCollector(BaseOpCollector): + MEMORY_OP_NAME = ["AscendCL@aclMallocMemInner", "AscendCL@aclrtFreePhysical", "AscendCL@aclrtFree"] + + def __init__(self): + super().__init__() + self.mem_op_info = {} + self.rule = self._load_rule() + + @staticmethod + def _load_rule(): + language = AdditionalArgsManager().language + memory_rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + "rules", + language, + "memory.yaml") + + memory_rule = FileManager.read_yaml_file(memory_rule_path) + return memory_rule + + def add_op(self, event): + if event.name not in self.MEMORY_OP_NAME: + return + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + for op in target_op_list: + if op.name not in self.mem_op_info: + self.mem_op_info[op.name] = dict(count=0, total_dur=0) + self.mem_op_info[op.name]["count"] += 1 + self.mem_op_info[op.name]["total_dur"] += float(op.dur) + + self.attribute_to_dataset["memory_ops"] = self + + +class DataloaderCollector(BaseOpCollector): + key_word = "dataloader" + + def __init__(self): + super().__init__() + + def add_op(self, event): + if self.key_word in event.name.lower(): + self.op_list.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur, + "stack": 
event.args.get("Call stack") + })) + + def post_process(self, *args, **kwargs): + self.attribute_to_dataset["dataloader"] = self.op_list + + +class SyncBNCollector(BaseOpCollector): + key_word = "syncbatchnorm" + + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.lower() == self.key_word: + self.op_list.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur + })) + + def post_process(self, target_op_list, **kwargs): + self.attribute_to_dataset["sync_batchnorm"] = target_op_list + + +class AtenCollector(BaseOpCollector): + + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.lower().startswith(f"{Constant.ATEN}{Constant.ATEN_SEP}") or event.name.lower().startswith( + f"{Constant.NPU_LOWER}{Constant.ATEN_SEP}"): + self._add_aten(event) + return + + # 检查cann层同步操作,根据时间窗口索引到host侧的aten算子并给出堆栈 + if event.name.startswith(Constant.SYNC_STREAM): + self._add_aten(event) + + def post_process(self, target_op_list, **kwargs): + self.attribute_to_dataset["aten"] = target_op_list + + def _add_aten(self, event: TimelineEvent): + self.op_list.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur + })) + + +class OptimizerCollector(BaseOpCollector): + + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.startswith(f"{Constant.OPTIMIZER}.{Constant.OPTIMIZER_STEP}{Constant.OPTIMIZER_SEP}"): + self.op_list.append(TimelineEvent( + {"name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur})) + + def post_process(self, target_op_list, **kwargs): + self.attribute_to_dataset["optimizer"] = target_op_list + + +class FrequencyCollector(BaseOpCollector): + KEY_WORD = "AI Core Freq" + + def __init__(self): + super().__init__() + self._previous_freq_index = -1 + + @staticmethod + def get_op_frequency(ai_core_ops, ai_core_freq): + 
ai_core_freq.sort(key=lambda x: float(x.ts)) + op_freq_record = {} + + op_index, freq_index = 0, 0 + while op_index < len(ai_core_ops) and freq_index < len(ai_core_freq): + op_event = ai_core_ops[op_index] + op_end_time = convert_to_float(op_event.ts) + convert_to_float(op_event.dur) + op_freq_list = [] + while freq_index < len(ai_core_freq): + freq_event = ai_core_freq[freq_index] + if convert_to_float(freq_event.end) < op_end_time: + op_freq_list.append(convert_to_float(freq_event.args.MHz)) + freq_index += 1 + continue + elif convert_to_float(freq_event.ts) < op_end_time: + if op_event.name not in op_freq_record: + op_freq_record[op_event.name] = {"count": 0, "dur": 0, "freq_list": []} + op_freq_record[op_event.name]["count"] += 1 + op_freq_record[op_event.name]["dur"] += convert_to_float(op_event.dur) + op_freq_list.append(convert_to_float(freq_event.args.MHz)) + op_freq_record[op_event.name]["freq_list"].append(min(op_freq_list)) + break + else: + break + + op_index += 1 + return op_freq_record + + def add_op(self, event): + if event.name == self.KEY_WORD: + if self._previous_freq_index != -1: + self.op_list[self._previous_freq_index]["end"] = event.get("ts", float(math.inf)) + self._previous_freq_index += 1 + event.setdefault("end", float(math.inf)) + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + ai_core_ops = kwargs.get("ai_core_ops", []) + if not ai_core_ops: + return + ai_core_ops.sort(key=lambda x: float(x.ts)) + op_freq = FrequencyCollector.get_op_frequency(ai_core_ops, target_op_list) + self.attribute_to_dataset["op_freq"] = op_freq + + +class SpecificTaskTypeOpCollector(BaseOpCollector): + + def __init__(self, op_type_list=None): + super().__init__() + self.op_type_list = op_type_list if op_type_list else [Constant.AI_CPU, Constant.AI_CORE, Constant.MIX_AIC] + + def add_op(self, event): + if event.args.get(Constant.TASK_TYPE) and event.args.get(Constant.TASK_TYPE) in self.op_type_list: + self.op_list.append( + 
TimelineEvent( + { + Constant.TASK_TYPE: event.args.get(Constant.TASK_TYPE), + "task_id": event.args.get("Task Id"), + "tid": event.tid, + "name": event.name, + "ts": str(event.ts), + "dur": str(event.dur) + } + ) + ) + + def post_process(self, target_op_list, **kwargs): + op_map = dict() + for op in target_op_list: + key = f"{op.name}-{op.ts}" + op_map[key] = op + + self.attribute_to_dataset["ops_with_task_type"] = op_map + self.attribute_to_dataset["task_op_names"] = list( + set([event_key.split("-")[0] for event_key in op_map.keys()])) + + +class TorchToNpuCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.lower() == Constant.TORCH_TO_NPU: + self.op_list.append(TimelineEvent({"tid": event.tid, "ts": str(event.ts), "ph": event.ph, "id": event.id})) + + def post_process(self, target_op_list, **kwargs): + op_map = dict() + for op in target_op_list: + key = f"{op.ph}-{op.id}" + op_map[key] = op + + self.attribute_to_dataset["torch_to_npu"] = op_map + + +class AclToNpuCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name and event.ts and event.name == Constant.ACL_TO_NPU: + self.op_list.append(TimelineEvent({"ts": event.ts})) + + def post_process(self, target_op_list, **kwargs): + op_record = set(str(op.ts) for op in target_op_list) + self.attribute_to_dataset["acl_to_npu"] = op_record + + +class OpStackCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.args.get(Constant.CALL_STACKS): + self.op_list.append( + TimelineEvent({"name": event.name, "dataset_index": event.dataset_index, "ts": event.ts})) + + def post_process(self, target_op_list, **kwargs): + op_map = dict() + for op in target_op_list: + op_map[str(op.ts)] = op + + self.attribute_to_dataset["ops_with_stack"] = op_map + + +class GcCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + def add_op(self, 
event): + if event.cat and isinstance(event.cat, str) and event.cat.lower() == "gc": + self.op_list.append(TimelineEvent( + {"name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur})) + + def post_process(self, target_op_list, **kwargs): + self.attribute_to_dataset["gc_events"] = self.op_list + + +class FreeEventsCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + @staticmethod + def _load_rule(): + language = AdditionalArgsManager().language + sync_stream_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + "rules", + language, + "gc.yaml") + + gc_rule = FileManager.read_yaml_file(sync_stream_rule_path) + return gc_rule + + def add_op(self, event): + if event.name.lower() == Constant.FREE: + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + gc_rule = self._load_rule() + if os.getenv(Constant.FREE_DURATION_FOR_GC_ANALYSIS): + max_free_threshold = convert_to_float(os.getenv(Constant.FREE_DURATION_FOR_GC_ANALYSIS)) + else: + max_free_threshold = gc_rule.get("max_free_threshold") + + large_free_events = [] + + for op in target_op_list: + if convert_to_float(op.dur) > max_free_threshold: + large_free_events.append(op) + + large_free_events.sort(key=lambda x: convert_to_float(x.ts)) + self.attribute_to_dataset["large_free_events"] = large_free_events + + +class AclEventsCollector(BaseOpCollector): + ACL_EVENT_PREFIX = "AscendCL@" + + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.startswith(self.ACL_EVENT_PREFIX): + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + target_op_list.sort(key=lambda x: convert_to_float(x.ts)) + self.attribute_to_dataset["acl_events"] = target_op_list diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py index 804eacbaa..d20df9a76 100644 --- a/profiler/advisor/display/html/render.py +++ 
b/profiler/advisor/display/html/render.py @@ -19,7 +19,7 @@ from typing import List, Dict from collections import defaultdict, OrderedDict from jinja2 import Environment, FileSystemLoader -from profiler.prof_common.constant import Constant +from profiler.advisor.common import constant from profiler.advisor.config.config import Config from profiler.advisor.utils.utils import singleton, safe_write @@ -40,7 +40,7 @@ class HTMLRender: self.render_list = defaultdict(list) def render_html(self, template_dir: str = "templates", template_name: str = "main.html", - template_header=Constant.DEFAULT_TEMPLATE_HEADER): + template_header=constant.DEFAULT_TEMPLATE_HEADER): # 确保overall 和 comparison 在 performance problem analysis 之前 sorted_render_htmls = OrderedDict() diff --git a/profiler/advisor/result/result.py b/profiler/advisor/result/result.py index eb8e38c41..40324d034 100644 --- a/profiler/advisor/result/result.py +++ b/profiler/advisor/result/result.py @@ -22,6 +22,7 @@ import click import xlsxwriter from prettytable import ALL, PrettyTable +from profiler.prof_common.additional_args_manager import AdditionalArgsManager from profiler.prof_common.constant import Constant from profiler.advisor.utils.utils import singleton, logger from profiler.advisor.config.config import Config @@ -208,11 +209,19 @@ class TerminalResult: def __init__(self): self.width, _ = self.get_terminal_size() - if self.width is None: - self.table = PrettyTable(["No.", "Category", "Description", "Suggestion"]) + language = AdditionalArgsManager().language + if language == "en": + if self.width is None: + self.table = PrettyTable(["No.", "Category", "Description", "Suggestion"]) + else: + self.table = PrettyTable(["No.", "Category", "Description", "Suggestion"], + max_table_width=max(self.width - 20, 180)) else: - self.table = PrettyTable(["No.", "Category", "Description", "Suggestion"], - max_table_width=max(self.width - 20, 180)) + if self.width is None: + self.table = PrettyTable(["No.", "类型", "描述", 
"建议"]) + else: + self.table = PrettyTable(["No.", "类型", "描述", "建议"], + max_table_width=max(self.width - 20, 180)) self.table.hrules = ALL self.result_list = [] diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py index 7c93d224c..65b24319b 100644 --- a/profiler/cli/analyze_cli.py +++ b/profiler/cli/analyze_cli.py @@ -46,6 +46,10 @@ def analyze_cli(**kwargs): @click.option("--force", is_flag=True, help="Indicates whether to skip file size verification and owner verification") +@click.option("--language", + type=click.Choice(["cn", "en"]), + default="cn", + help="Language of Advisor html") @debug_option def analyze_all(**kwargs) -> None: try: diff --git a/profiler/test/ut/advisor/timeline_advice/test_dataloader_checker.py b/profiler/test/ut/advisor/timeline_advice/test_dataloader_checker.py index f2bd26957..0168005e0 100644 --- a/profiler/test/ut/advisor/timeline_advice/test_dataloader_checker.py +++ b/profiler/test/ut/advisor/timeline_advice/test_dataloader_checker.py @@ -16,7 +16,7 @@ class TestDataloaderChecker(unittest.TestCase): def setUp(self) -> None: rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))), - "advisor", "rules", "dataloader.yaml") + "advisor", "rules", "en", "dataloader.yaml") with open(rule_path, "rb") as file: self.rule = yaml.safe_load(file) diff --git a/profiler/test/ut/advisor/timeline_advice/test_memory_op_checker.py b/profiler/test/ut/advisor/timeline_advice/test_memory_op_checker.py index a5326b989..5dea7842d 100644 --- a/profiler/test/ut/advisor/timeline_advice/test_memory_op_checker.py +++ b/profiler/test/ut/advisor/timeline_advice/test_memory_op_checker.py @@ -1,62 +1,62 @@ -import unittest -import os -import sys -import yaml - -from profiler.advisor.analyzer.memory.memory_checker import MemoryOpsChecker -from profiler.advisor.common.timeline.event import TimelineEvent -from profiler.test.ut.advisor.advisor_backend.tools.tool import 
recover_env - - -class TestMemOpChecker(unittest.TestCase): - @classmethod - def tearDownClass(cls) -> None: - recover_env() - - def setUp(self) -> None: - rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( - os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))), - "advisor", "rules", "memory.yaml") - - with open(rule_path, "rb") as file: - self.rule = yaml.safe_load(file) - - def test_no_mem_op(self): - dataset = self._get_mock_dataset(1, is_empty_dataset=True) - - checker = MemoryOpsChecker() - checker.check_memory_ops(dataset) - self.assertFalse(checker.memory_issues) - - def test_mem_op_not_reach_threshold(self): - dataset = self._get_mock_dataset(1, is_empty_dataset=False) - - checker = MemoryOpsChecker() - checker.check_memory_ops(dataset) - self.assertFalse(checker.memory_issues) - - def test_mem_op_reach_threshold(self): - dataset = self._get_mock_dataset(1, 1000000, is_empty_dataset=False) - - checker = MemoryOpsChecker() - checker.check_memory_ops(dataset) - self.assertTrue(checker.memory_issues) - - def _get_mock_dataset(self, mem_op_num, mem_op_total_dur=1000, is_empty_dataset=False): - dataset = TimelineEvent() - if is_empty_dataset: - return dataset - - mem_op_info = TimelineEvent() - for i in range(mem_op_num): - mem_op_info[f"mock_mem_op_{i}"] = TimelineEvent({"total_dur": mem_op_total_dur, "count": 10}) - - dataset["memory_ops"] = TimelineEvent({"mem_op_info": mem_op_info, "rule": TimelineEvent(self.rule)}) - return dataset - - -if __name__ == '__main__': - tester = TestMemOpChecker() - tester.test_no_mem_op() - tester.test_mem_op_not_reach_threshold() +import unittest +import os +import sys +import yaml + +from profiler.advisor.analyzer.memory.memory_checker import MemoryOpsChecker +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.test.ut.advisor.advisor_backend.tools.tool import recover_env + + +class TestMemOpChecker(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> 
None: + recover_env() + + def setUp(self) -> None: + rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( + os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))), + "advisor", "rules", "en", "memory.yaml") + + with open(rule_path, "rb") as file: + self.rule = yaml.safe_load(file) + + def test_no_mem_op(self): + dataset = self._get_mock_dataset(1, is_empty_dataset=True) + + checker = MemoryOpsChecker() + checker.check_memory_ops(dataset) + self.assertFalse(checker.memory_issues) + + def test_mem_op_not_reach_threshold(self): + dataset = self._get_mock_dataset(1, is_empty_dataset=False) + + checker = MemoryOpsChecker() + checker.check_memory_ops(dataset) + self.assertFalse(checker.memory_issues) + + def test_mem_op_reach_threshold(self): + dataset = self._get_mock_dataset(1, 1000000, is_empty_dataset=False) + + checker = MemoryOpsChecker() + checker.check_memory_ops(dataset) + self.assertTrue(checker.memory_issues) + + def _get_mock_dataset(self, mem_op_num, mem_op_total_dur=1000, is_empty_dataset=False): + dataset = TimelineEvent() + if is_empty_dataset: + return dataset + + mem_op_info = TimelineEvent() + for i in range(mem_op_num): + mem_op_info[f"mock_mem_op_{i}"] = TimelineEvent({"total_dur": mem_op_total_dur, "count": 10}) + + dataset["memory_ops"] = TimelineEvent({"mem_op_info": mem_op_info, "rule": TimelineEvent(self.rule)}) + return dataset + + +if __name__ == '__main__': + tester = TestMemOpChecker() + tester.test_no_mem_op() + tester.test_mem_op_not_reach_threshold() tester.test_mem_op_reach_threshold() \ No newline at end of file diff --git a/profiler/test/ut/advisor/timeline_advice/test_syncbn_checker.py b/profiler/test/ut/advisor/timeline_advice/test_syncbn_checker.py index ecd4ee6cc..bb627a943 100644 --- a/profiler/test/ut/advisor/timeline_advice/test_syncbn_checker.py +++ b/profiler/test/ut/advisor/timeline_advice/test_syncbn_checker.py @@ -16,7 +16,7 @@ class TestSyncBNChecker(unittest.TestCase): def setUp(self) -> 
None: rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))), - "advisor", "rules", "sync_batchnorm.yaml") + "advisor", "rules", "en", "sync_batchnorm.yaml") with open(rule_path, "rb") as file: self.rule = yaml.safe_load(file) diff --git a/profiler/test/ut/advisor/timeline_advice/test_synchronize_stream.py b/profiler/test/ut/advisor/timeline_advice/test_synchronize_stream.py index 8f5d3d4ca..44afa86db 100644 --- a/profiler/test/ut/advisor/timeline_advice/test_synchronize_stream.py +++ b/profiler/test/ut/advisor/timeline_advice/test_synchronize_stream.py @@ -18,7 +18,7 @@ class TestSynchronizeChecker(unittest.TestCase): def setUp(self) -> None: rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))), - "advisor", "rules", "synchronize.yaml") + "advisor", "rules", "en", "synchronize.yaml") with open(rule_path, "rb") as file: self.rule = yaml.safe_load(file) -- Gitee From 7f3c16e2d83f3f61693f5c2e40d8ec70ba209ce4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=A7=E5=AE=9D?= Date: Mon, 9 Dec 2024 19:49:24 +0800 Subject: [PATCH 3/8] bugfix --- .../fusion_ops/fusion_ops_analyzer.py | 4 +- profiler/advisor/common/constant.py | 163 ------------------ profiler/prof_common/constant.py | 4 +- 3 files changed, 5 insertions(+), 166 deletions(-) delete mode 100644 profiler/advisor/common/constant.py diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py index feacbb37b..6b549ad0c 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -103,7 +103,7 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): suggestion = "Please replace training api according to sub table 'Affinity training api'" if self.empty_stacks: desc += ", 
but with no stack" - suggestion = const.TIMELINE_EMPTY_STACKS_PROMPT.format( + suggestion = Constant.TIMELINE_EMPTY_STACKS_PROMPT.format( timeline_profiling_doc_url=Config().timeline_with_stack_doc_url ) sheet_name = "Affinity apis" @@ -113,7 +113,7 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): suggestion = "请根据子表'Affinity training api'替换训练api接口" if self.empty_stacks: desc += ",但没有堆栈" - suggestion = const.TIMELINE_EMPTY_STACKS_PROMPT_CN.format( + suggestion = Constant.TIMELINE_EMPTY_STACKS_PROMPT_CN.format( timeline_profiling_doc_url=Config().timeline_with_stack_doc_url ) sheet_name = "亲和API接口" diff --git a/profiler/advisor/common/constant.py b/profiler/advisor/common/constant.py deleted file mode 100644 index cc76f8e01..000000000 --- a/profiler/advisor/common/constant.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import stat - -# timeline -DEQUEUE = "Dequeue" -DEQUEUE_SEP = "@" -ATEN = "aten" -NPU = "npu" -ATEN_SEP = "::" -OPTIMIZER = "Optimizer" -OPTIMIZER_SEP = "#" -OPTIMIZER_STEP = "step" -ENQUEUE = "enqueue" -TORCH_TO_NPU = "torch_to_npu" -FREE = "free" -OP_COMPILE_NAME = "AscendCL@aclopCompileAndExecute" -OP_COMPILE_ID = "aclopCompileAndExecute" -SYNC_STREAM = "AscendCL@aclrtSynchronizeStream" -NODE_LAUNCH = "Node@launch" -MAX_OP_COMPILE_NUM = 20 -ACL_TO_NPU = "acl_to_npu" -TASK_TYPE = "Task Type" -CPU_OP = "cpu_op" -AI_CORE = "AI_CORE" -AI_CPU = "AI_CPU" -MIX_AIC = "MIX_AIC" -CALL_STACKS = "Call stack" -INPUT_DIMS = "Input Dims" -OP_SEP = "-" -ADVISOR_MAX_PROCESSES = 8 -ADVISOR_ANALYZE_PROCESSES = "ADVISOR_ANALYZE_PROCESSES" -TIMELINE_OP_STACKS_DATASET = "timeline_op_stacks_dataset" -TIMELINE_BACKWARD_NO_STACK = "Backward broadcast, without call stacks in profiling." -TIMELINE_ACL_TO_NPU_NO_STACK = "Incoming flow is 'acl_to_npu', without call stacks in profiling." -TIMELINE_BACKWARD_NO_STACK_CODE = -1 -TIMELINE_ACL_TO_NPU_NO_STACK_CODE = -2 -TIMELINE_FUSION_OPS_NO_STACK_FLAG = "NO STACK" -NO_STACK_REASON_MAP = { - TIMELINE_BACKWARD_NO_STACK_CODE: "Backward broadcast, without call stacks in profiling.", - TIMELINE_ACL_TO_NPU_NO_STACK_CODE: "Incoming flow is 'acl_to_npu', without call stacks in profiling." -} -AFFINITY_TRAINING_API = "Affinity training api" -TIMELINE_EMPTY_STACKS_PROMPT = "These APIs have no code stack. If parameter 'with_stack=False' while profiling, " \ - "please refer to {timeline_profiling_doc_url} to set 'with_stack=True'. " \ - "Otherwise, ignore following affinity APIs due to backward broadcast lack of stack." 
-TIMELINE_EMPTY_STACKS_PROMPT_CN = "这些API接口没有代码堆栈。如果采集profiling时参数为'with_stack=False'," \ - "请参考{timeline_profiling_doc_url}设置'with_stack=True'。" \ - "另外,由于反向传播没有堆栈,请忽略以下亲和APIs。" -CLUSTER_ANALYSIS = "Cluster analysis" -SLOW_RANK_TIME_RATIO_THRESHOLD = 0.05 - -CANN_VERSION = "cann_version" -TORCH_VERSION = "torch_version" -PROFILING_TYPE = "profiling_type" -ANALYSIS_DIMENSIONS = "analysis_dimensions" - -PROFILER_METADATA = "profiler_metadata.json" - -TERMINAL_OUTPUT_HEADERS = ["No.", "Problem", "Description", "Suggestion"] -SKIP_ANALYZE_PROMPT = "Finish analysis, no optimization suggestions" -SKIP_QUERY_PROMPT = "Finish query operator stack, no operators" - -# operator output constant -OPERATOR_OUT_TOPK = 10 -OPERATOR_LIST_UNLIMIT = -1 - -DEFAULT_OPERATOR_TYPE = 'None_type' -DEFAULT_DURATION_ZERO = 0.0 - -ADVISOR_LOG_LEVEL = "ADVISOR_LOG_LEVEL" -DEFAULT_LOG_LEVEL = "INFO" -SUPPORTED_LOG_LEVEL = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] - -RULE_BUCKET = "RULE-BUCKET" -CLOUD_RULE_REGION_CN_NORTH_9 = "cn-north-9" -CLOUD_RULE_REGION_CN_NORTH_7 = "cn-north-7" -CLOUD_RULE_REGION_CN_SOUTHWEST_2 = "cn-southwest-2" -CLOUD_RULE_REGION_LIST = [CLOUD_RULE_REGION_CN_NORTH_7, CLOUD_RULE_REGION_CN_NORTH_9, CLOUD_RULE_REGION_CN_SOUTHWEST_2] -INNER_REGION_LIST = [CLOUD_RULE_REGION_CN_NORTH_7] -DEFAULT_CLOUD_RULE_REGION = CLOUD_RULE_REGION_CN_SOUTHWEST_2 - -HTTP_PREFIXES = "http://" -HTTPS_PREFIXES = "https://" -COMMON_YAML_DIR = "modelarts/solution/ma_advisor_rules/" -COMMON_ENDPOINT_SUFFIX = "obs.{}.myhuaweicloud.com" -INNER_ENDPOINT_SUFFIX = "obs.{}.ulanqab.huawei.com" - -AICPU_RULES_YAML_NAME = "aicpu_rules.yaml" -FUSION_PASS_YAML_NAME = "op_fusion_pass.yaml" -TIMELINE_FUSION_OPS_YAML_NAME = "timeline_fusion_ops.yaml" -CLOUD_YAML_NAME_LIST = [AICPU_RULES_YAML_NAME, FUSION_PASS_YAML_NAME, TIMELINE_FUSION_OPS_YAML_NAME] - -MAX_RETRIES = 3 -TIMEOUT = 3 -DEPTH_LIMIT = 20 - -ADVISOR_RULE_PATH = "ADVISOR_RULE_PATH" -CLOUD_RULE_PATH = "rules/cloud/" -DEFAULT_RULE_PATH = 
"./rules/" - -TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID = -1 - -DEFAULT_TEMPLATE_HEADER = "Performance Optimization Suggestions" - -PT_PROF_SUFFIX = "ascend_pt" -ASCEND_PROFILER_OUTPUT = "ASCEND_PROFILER_OUTPUT" -COLLECTION_PATH = "collection_path" -CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output" -KERNEL_DETAILS_CSV = "kernel_details.csv" -CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv" -CLUSTER_COMM_JSON = "cluster_communication.json" -COMMUNICATION_JSON = "communication.json" - -BOTTLENECK = "bottleneck" -DATA = "data" -ADVISOR_ANALYSIS_OUTPUT_DIR = "advisor_analysis_result" -DEFAULT_PROCESSES = 8 -CLUSTER_ANALYSIS_FILE_PATTERN = [ - r'profiler_info_\d+\.json', "step_trace_time.csv", "communication.json", "communication_matrix.json" -] -ANALYSIS_OUTPUT_PATH = "ANALYSIS_OUTPUT_PATH" -DEFAULT_RANK_FOR_PROFILING_ANALYSIS = 0 -PROFILER_INFO_FILE_PATTERN = r"profiler_info_(\d+)\.json" -DISABLE_STREAMINIG_READER = "DISABLE_STREAMINIG_READER" -FRAMEWORK_STACK_BLACK_LIST = ["torch", "torch_npu", "megatron", "deepspeed"] -DISABLE_STREAMING_READER = "DISABLE_STREAMING_READER" -MAX_FILE_SIZE = 10 ** 10 -MAX_NUM_PROCESSES = 4 -DEFAULT_STEP = "-1" -STEP_RANK_SEP = "_" - -MAX_READ_LINE_BYTES = 8196 * 1024 -MAX_READ_FILE_BYTES = 64 * 1024 * 1024 * 1024 -MAX_READ_DB_FILE_BYTES = 8 * 1024 * 1024 * 1024 - -# Unit Conversion -COMMUNICATION_B_TO_GB = 0.001 ** 3 -US_TO_S = 0.001 ** 2 - -WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP -WRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC - -DISABLE_PROFILING_COMPARISON = "DISABLE_PROFILING_COMPARISON" -FREE_DURATION_FOR_GC_ANALYSIS = "FREE_DURATION_FOR_GC_ANALYSIS" -DISABLE_AFFINITY_API = "DISABLE_AFFINITY_API" - -# communication.json -TOTAL_OP_INFO = "Total Op Info" diff --git a/profiler/prof_common/constant.py b/profiler/prof_common/constant.py index 95a3e604a..dcea8bb86 100644 --- a/profiler/prof_common/constant.py +++ b/profiler/prof_common/constant.py @@ -286,7 +286,9 @@ class Constant(object): 
TIMELINE_EMPTY_STACKS_PROMPT = "These APIs have no code stack. If parameter 'with_stack=False' while profiling, " \ "please refer to {timeline_profiling_doc_url} to set 'with_stack=True'. " \ "Otherwise, ignore following affinity APIs due to backward broadcast lack of stack." - + TIMELINE_EMPTY_STACKS_PROMPT_CN = "这些API接口没有代码堆栈。如果采集profiling时参数为'with_stack=False'," \ + "请参考{timeline_profiling_doc_url}设置'with_stack=True'。" \ + "另外,由于反向传播没有堆栈,请忽略以下亲和APIs。" CLUSTER_ANALYSIS = "Cluster analysis" SLOW_RANK_TIME_RATIO_THRESHOLD = 0.05 -- Gitee From 147b59d3d12e3ab0b5d3e2b9e5d7311e6e5279f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=A7=E5=AE=9D?= Date: Mon, 9 Dec 2024 19:57:33 +0800 Subject: [PATCH 4/8] bugfix2 --- profiler/advisor/analyzer/cluster/slow_link_analyzer.py | 2 +- profiler/advisor/display/html/render.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyzer.py b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py index 57c4694c0..2f131509d 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyzer.py +++ b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py @@ -18,7 +18,7 @@ from typing import Dict, List import logging from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer -from profiler.advisor.common import constant +from profiler.prof_common.constant import Constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py index d20df9a76..804eacbaa 100644 --- a/profiler/advisor/display/html/render.py +++ b/profiler/advisor/display/html/render.py @@ -19,7 +19,7 @@ from typing import List, Dict from collections import defaultdict, OrderedDict from jinja2 import Environment, FileSystemLoader -from 
profiler.advisor.common import constant +from profiler.prof_common.constant import Constant from profiler.advisor.config.config import Config from profiler.advisor.utils.utils import singleton, safe_write @@ -40,7 +40,7 @@ class HTMLRender: self.render_list = defaultdict(list) def render_html(self, template_dir: str = "templates", template_name: str = "main.html", - template_header=constant.DEFAULT_TEMPLATE_HEADER): + template_header=Constant.DEFAULT_TEMPLATE_HEADER): # 确保overall 和 comparison 在 performance problem analysis 之前 sorted_render_htmls = OrderedDict() -- Gitee From cb6f275fc9eda298d3da8976d4d2f53a945c1285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=A7=E5=AE=9D?= Date: Mon, 9 Dec 2024 20:04:36 +0800 Subject: [PATCH 5/8] bugfix3 --- profiler/advisor/analyzer/overall/overall_summary_analyzer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index 90b83e82c..fa67cb01a 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -20,6 +20,7 @@ from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.result.result import OptimizeResult from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface +from profiler.prof_common.constant import Constant from profiler.prof_common.additional_args_manager import AdditionalArgsManager -- Gitee From e0190839748e9b71aa4b6c4528f8c25e3c22f004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=A7=E5=AE=9D?= Date: Mon, 16 Dec 2024 21:25:21 +0800 Subject: [PATCH 6/8] =?UTF-8?q?=E4=BC=98=E5=8C=96advicemap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../overall_advice/overall_summary_advice.py | 57 ++--------- 
.../overall/overall_summary_analyzer.py | 81 +++------------- profiler/cli/analyze_cli.py | 2 +- profiler/prof_common/constant.py | 95 ++++++++++++++++++- 4 files changed, 117 insertions(+), 118 deletions(-) diff --git a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py index 05e545bef..bfaf6bb00 100644 --- a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py +++ b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py @@ -23,53 +23,6 @@ from profiler.prof_common.additional_args_manager import AdditionalArgsManager class OverallSummaryAdvice(AdviceBase): - language = AdditionalArgsManager().language - if language == "en": - advice_map = { - "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.", - "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.", - "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule." 
- } - time_name_map = { - "Computing Time": "computing", - "Uncovered Communication Time": "communication", - "Free Time": "free", - 'Cube Time(Num)': 'Cube Time', - 'Vector Time(Num)': 'Vector Time', - 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', - 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', - 'Other Time': "Other Computing Time", - 'SDMA Time(Num)': 'SDMA Time' - } - performance_time_dict = { - "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', - 'Flash Attention Time(Backward)(Num)', 'Other Time'], - "Uncovered Communication Time(Wait Time)": [], - "Free Time": ['SDMA Time(Num)'] - } - else: - advice_map = { - "计算时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor computation.", - "未被掩盖的通信时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule.", - "空闲时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule." - } - time_name_map = { - "计算时长": "computing", - "未被掩盖的通信时长": "communication", - "空闲时长": "free", - 'Cube算子时长(数量)': 'Cube Time', - 'Vector算子时长(数量)': 'Vector Time', - 'Flash Attention算子时长(前向)(数量)': 'Flash Attention Time(Forward)', - 'Flash Attention算子时长(反向)(数量)': 'Flash Attention Time(Backward)', - '其它时长': "Other Computing Time", - 'SDMA时长(数量)': 'SDMA Time' - } - performance_time_dict = { - "计算时长": ['Cube时长(数量)', 'Vector时长(数量)', 'Flash Attention时长(前向)(数量)', - 'Flash Attention时长(反向)(数量)', '其它时长'], - "未被掩盖的通信时长(等待时长)": [], - "空闲时长": ['SDMA Time(Num)'] - } def __init__(self, collection_path: str, kwargs: dict): super().__init__(collection_path) @@ -83,6 +36,16 @@ class OverallSummaryAdvice(AdviceBase): self._base_data = [] self._comparison_data = [] + language = AdditionalArgsManager().language + if language == "en": + self.advice_map = Constant.ADVISOR_ADVICE_MAP + self.time_name_map = Constant.TIME_NAME_MAP + self.performance_time_dict = Constant.ADVISOR_PERFORMANCE_TIME_DICT + else: + self.advice_map = Constant.ADVISOR_ADVICE_MAP_CN + self.time_name_map = 
Constant.TIME_NAME_MAP_CN + self.performance_time_dict = Constant.ADVISOR_PERFORMANCE_TIME_DICT_CN + @staticmethod def split_duration_and_num(time_value: str) -> tuple: split_data = time_value.split("s") # time value example: 0.229s(1756) diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index fa67cb01a..d8e3b5701 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -25,75 +25,6 @@ from profiler.prof_common.additional_args_manager import AdditionalArgsManager class OverallSummaryAnalyzer(BaseAnalyzer): - language = AdditionalArgsManager().language - if language == "en": - OVERALL_SUMMARY_ANALYZER = "overall summary" - advice_map = { - "Computing Time": "if you want more detailed advice please go to mstt_advisor_*.html", - "Uncovered Communication Time": "if you want more detailed advice please go to mstt_advisor_*.html", - "Free Time": "if you want more detailed advice please go to mstt_advisor_*.html" - } - time_name_map = { - "Computing Time": "computing", - "Uncovered Communication Time": "communication", - "Free Time": "free", - 'Cube Time(Num)': 'Cube Time', - 'Vector Time(Num)': 'Vector Time', - 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', - 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', - 'Other Time': "Other Computing Time", - 'SDMA Time(Num)': 'SDMA Time' - } - performance_time_dict = { - "Computing Time": "computing_time_ms", - " -- Flash Attention": "fa_time_ms", - " -- Conv": "conv_time_ms", - " -- Matmul": "matmul_time_ms", - " -- Vector": "vector_time_ms", - " -- SDMA(Tensor Move)": "tensor_move_time_ms", - " -- Other Cube": "other_cube_time_ms", - "Uncovered Communication Time": "uncovered_communication_time_ms", - " -- Wait": "wait_time_ms", - " -- Transmit": "transmit_time_ms", - "Free Time": "free_time_ms", - " -- 
SDMA": "sdma_time_ms", - " -- Free": "free_ms", - "E2E Time": "e2e_time_ms" - } - else: - OVERALL_SUMMARY_ANALYZER = "整网耗时分析" - advice_map = { - "计算时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", - "未被掩盖的通信时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", - "空闲时长": "如果你想了解更多详细建议请看mstt_advisor_*.html" - } - time_name_map = { - "计算时长": "computing", - "未被掩盖的通信时长": "communication", - "空闲时长": "free", - 'Cube算子时长(数量)': 'Cube Time', - 'Vector算子时长(数量)': 'Vector Time', - 'Flash Attention算子时长(前向)(数量)': 'Flash Attention Time(Forward)', - 'Flash Attention算子时长(反向)(数量)': 'Flash Attention Time(Backward)', - '其它时长': "Other Computing Time", - 'SDMA时长(数量)': 'SDMA Time' - } - performance_time_dict = { - "计算时长": "computing_time_ms", - " -- Flash Attention": "fa_time_ms", - " -- Conv": "conv_time_ms", - " -- Matmul": "matmul_time_ms", - " -- Vector": "vector_time_ms", - " -- SDMA(Tensor Move)": "tensor_move_time_ms", - " -- 其它Cube": "other_cube_time_ms", - "未被掩盖的通信时长": "uncovered_communication_time_ms", - " -- 等待时长": "wait_time_ms", - " -- 传输时长": "transmit_time_ms", - "空闲时长": "free_time_ms", - " -- SDMA": "sdma_time_ms", - " -- 空闲时长": "free_ms", - "E2E时长": "e2e_time_ms" - } def __init__(self, collection_path: str, n_processes: int = 1, **kwargs): profile_path = get_profile_path(collection_path) @@ -111,6 +42,18 @@ class OverallSummaryAnalyzer(BaseAnalyzer): self.bottleneck_str = "" self.over_summary_analysis = {} + language = AdditionalArgsManager + if language == "en": + self.over_summary_analyzer = Constant.OVERALL_SUMMARY_ANALYZER + self.advice_map = Constant.ANALYZER_ADVICE_MAP + self.time_name_map = Constant.TIME_NAME_MAP + self.performance_time_dict = Constant.ANALYZER_PERFORMANCE_TIME_DICT + else: + self.over_summary_analyzer = Constant.OVERALL_SUMMARY_ANALYZER_CN + self.advice_map = Constant.ANALYZER_ADVICE_MAP_CN + self.time_name_map = Constant.TIME_NAME_MAP_CN + self.performance_time_dict = Constant.ANALYZER_PERFORMANCE_TIME_DICT_CN + @staticmethod def calculate_ratio(dividend, divisor): 
if not divisor: diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py index 65b24319b..59a4da80c 100644 --- a/profiler/cli/analyze_cli.py +++ b/profiler/cli/analyze_cli.py @@ -46,7 +46,7 @@ def analyze_cli(**kwargs): @click.option("--force", is_flag=True, help="Indicates whether to skip file size verification and owner verification") -@click.option("--language", +@click.option("--language", "-l", type=click.Choice(["cn", "en"]), default="cn", help="Language of Advisor html") diff --git a/profiler/prof_common/constant.py b/profiler/prof_common/constant.py index dcea8bb86..336ebfb47 100644 --- a/profiler/prof_common/constant.py +++ b/profiler/prof_common/constant.py @@ -386,4 +386,97 @@ class Constant(object): MINDSPORE_VERSION = "mindspore_version" PYTORCH = "pytorch" - MINDSPORE = "mindspore" \ No newline at end of file + MINDSPORE = "mindspore" + + # overall summary advice + ADVISOR_ADVICE_MAP = { + "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.", + "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.", + "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule." 
+ } + TIME_NAME_MAP = { + "Computing Time": "computing", + "Uncovered Communication Time": "communication", + "Free Time": "free", + 'Cube Time(Num)': 'Cube Time', + 'Vector Time(Num)': 'Vector Time', + 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', + 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', + 'Other Time': "Other Computing Time", + 'SDMA Time(Num)': 'SDMA Time' + } + ADVISOR_PERFORMANCE_TIME_DICT = { + "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', + 'Flash Attention Time(Backward)(Num)', 'Other Time'], + "Uncovered Communication Time(Wait Time)": [], + "Free Time": ['SDMA Time(Num)'] + } + ADVISOR_ADVICE_MAP_CN = { + "计算时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor computation.", + "未被掩盖的通信时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule.", + "空闲时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule." + } + TIME_NAME_MAP_CN = { + "计算时长": "computing", + "未被掩盖的通信时长": "communication", + "空闲时长": "free", + 'Cube算子时长(数量)': 'Cube Time', + 'Vector算子时长(数量)': 'Vector Time', + 'Flash Attention算子时长(前向)(数量)': 'Flash Attention Time(Forward)', + 'Flash Attention算子时长(反向)(数量)': 'Flash Attention Time(Backward)', + '其它时长': "Other Computing Time", + 'SDMA时长(数量)': 'SDMA Time' + } + ADVISOR_PERFORMANCE_TIME_DICT_CN = { + "计算时长": ['Cube时长(数量)', 'Vector时长(数量)', 'Flash Attention时长(前向)(数量)', + 'Flash Attention时长(反向)(数量)', '其它时长'], + "未被掩盖的通信时长(等待时长)": [], + "空闲时长": ['SDMA Time(Num)'] + } + + # overall summary analyzer + OVERALL_SUMMARY_ANALYZER = "overall summary" + ANALYZER_ADVICE_MAP = { + "Computing Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Uncovered Communication Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Free Time": "if you want more detailed advice please go to mstt_advisor_*.html" + } + ANALYZER_PERFORMANCE_TIME_DICT = { + "Computing Time": "computing_time_ms", + " -- Flash Attention": "fa_time_ms", + " 
-- Conv": "conv_time_ms", + " -- Matmul": "matmul_time_ms", + " -- Vector": "vector_time_ms", + " -- SDMA(Tensor Move)": "tensor_move_time_ms", + " -- Other Cube": "other_cube_time_ms", + "Uncovered Communication Time": "uncovered_communication_time_ms", + " -- Wait": "wait_time_ms", + " -- Transmit": "transmit_time_ms", + "Free Time": "free_time_ms", + " -- SDMA": "sdma_time_ms", + " -- Free": "free_ms", + "E2E Time": "e2e_time_ms" + } + OVERALL_SUMMARY_ANALYZER_CN = "整网耗时分析" + ANALYZER_ADVICE_MAP_CN = { + "计算时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", + "未被掩盖的通信时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", + "空闲时长": "如果你想了解更多详细建议请看mstt_advisor_*.html" + } + ANALYZER_PERFORMANCE_TIME_DICT_CN = { + "计算时长": "computing_time_ms", + " -- Flash Attention": "fa_time_ms", + " -- Conv": "conv_time_ms", + " -- Matmul": "matmul_time_ms", + " -- Vector": "vector_time_ms", + " -- SDMA(Tensor Move)": "tensor_move_time_ms", + " -- 其它Cube": "other_cube_time_ms", + "未被掩盖的通信时长": "uncovered_communication_time_ms", + " -- 等待时长": "wait_time_ms", + " -- 传输时长": "transmit_time_ms", + "空闲时长": "free_time_ms", + " -- SDMA": "sdma_time_ms", + " -- 空闲时长": "free_ms", + "E2E时长": "e2e_time_ms" + } + -- Gitee From 3e5984e948ca163d2b0bc9de78591fbc80439787 Mon Sep 17 00:00:00 2001 From: xiao-yamin Date: Wed, 18 Dec 2024 11:53:59 +0800 Subject: [PATCH 7/8] =?UTF-8?q?=E9=87=8D=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../overall_advice/overall_summary_advice.py | 15 +-- .../Communication_retransmission_checker.py | 17 +--- .../alignment/byte_alignment_checker.py | 17 ++-- .../bandwidth_contention_checker.py | 19 ++-- .../communication/packet/packet_checker.py | 16 +-- .../communication_retransmission_checker.py | 16 +-- .../ai_core_freq/ai_core_freq_checker.py | 34 +++---- .../op_compile/dynamic_shape_checker.py | 64 +++++------- .../analyzer/computation/operator_checker.py | 91 ++++++----------- .../graph_fusion/graph_fusion_checker.py | 18 
++-- .../overall/environment_variable_checker.py | 24 ++--- .../overall/overall_summary_analyzer.py | 28 +++--- .../dispatch/timeline_op_dispatch_analyzer.py | 29 +++--- .../fusion_ops/fusion_ops_analyzer.py | 50 ++++------ .../analyzer/schedule/gc/gc_checker.py | 21 ++-- profiler/advisor/common/analyzer_scopes.py | 72 +++++--------- profiler/advisor/display/prompt/__init__.py | 0 .../advisor/display/prompt/base_prompt.py | 27 +++++ .../advisor/display/prompt/cn/__init__.py | 0 .../display/prompt/cn/ai_core_freq_prompt.py | 21 ++++ .../display/prompt/cn/dynamic_shape_prompt.py | 24 +++++ .../prompt/cn/environment_variable_prompt.py | 19 ++++ .../display/prompt/cn/fusion_ops_prompt.py | 23 +++++ .../display/prompt/cn/graph_fusion_prompt.py | 19 ++++ .../display/prompt/cn/operator_prompt.py | 31 ++++++ .../cn/overall_summary_advice_prompt.py | 38 +++++++ .../cn/overall_summary_analyzer_prompt.py | 49 ++++++++++ .../prompt/cn/timeline_op_dispatch_prompt.py | 21 ++++ .../advisor/display/prompt/en/__init__.py | 0 .../display/prompt/en/ai_core_freq_prompt.py | 36 +++++++ .../display/prompt/en/dynamic_shape_prompt.py | 24 +++++ .../prompt/en/environment_variable_prompt.py | 19 ++++ .../display/prompt/en/fusion_ops_prompt.py | 23 +++++ .../display/prompt/en/graph_fusion_prompt.py | 19 ++++ .../display/prompt/en/operator_prompt.py | 31 ++++++ .../en/overall_summary_advice_prompt.py | 38 +++++++ .../en/overall_summary_analyzer_prompt.py | 49 ++++++++++ .../prompt/en/timeline_op_dispatch_prompt.py | 22 +++++ .../rules/cn/bandwidth_contention.yaml | 3 +- profiler/advisor/rules/cn/byte_alignment.yaml | 3 +- .../rules/cn/environment_variable_info.yaml | 2 +- profiler/advisor/rules/cn/gc.yaml | 1 + profiler/advisor/rules/cn/packet.yaml | 3 +- profiler/advisor/rules/cn/rdma_analysis.yaml | 3 +- .../rules/en/bandwidth_contention.yaml | 3 +- profiler/advisor/rules/en/byte_alignment.yaml | 3 +- profiler/advisor/rules/en/gc.yaml | 1 + profiler/advisor/rules/en/packet.yaml | 3 +- 
profiler/advisor/rules/en/rdma_analysis.yaml | 3 +- profiler/cli/analyze_cli.py | 22 ++++- .../prof_common/additional_args_manager.py | 4 +- profiler/prof_common/constant.py | 98 ------------------- 52 files changed, 767 insertions(+), 449 deletions(-) create mode 100644 profiler/advisor/display/prompt/__init__.py create mode 100644 profiler/advisor/display/prompt/base_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/__init__.py create mode 100644 profiler/advisor/display/prompt/cn/ai_core_freq_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/dynamic_shape_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/environment_variable_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/fusion_ops_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/graph_fusion_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/operator_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/overall_summary_advice_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/overall_summary_analyzer_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/timeline_op_dispatch_prompt.py create mode 100644 profiler/advisor/display/prompt/en/__init__.py create mode 100644 profiler/advisor/display/prompt/en/ai_core_freq_prompt.py create mode 100644 profiler/advisor/display/prompt/en/dynamic_shape_prompt.py create mode 100644 profiler/advisor/display/prompt/en/environment_variable_prompt.py create mode 100644 profiler/advisor/display/prompt/en/fusion_ops_prompt.py create mode 100644 profiler/advisor/display/prompt/en/graph_fusion_prompt.py create mode 100644 profiler/advisor/display/prompt/en/operator_prompt.py create mode 100644 profiler/advisor/display/prompt/en/overall_summary_advice_prompt.py create mode 100644 profiler/advisor/display/prompt/en/overall_summary_analyzer_prompt.py create mode 100644 profiler/advisor/display/prompt/en/timeline_op_dispatch_prompt.py diff --git 
a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py index bfaf6bb00..bf23de569 100644 --- a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py +++ b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py @@ -36,15 +36,18 @@ class OverallSummaryAdvice(AdviceBase): self._base_data = [] self._comparison_data = [] + self._init_prompt_by_language() + + def _init_prompt_by_language(self): language = AdditionalArgsManager().language if language == "en": - self.advice_map = Constant.ADVISOR_ADVICE_MAP - self.time_name_map = Constant.TIME_NAME_MAP - self.performance_time_dict = Constant.ADVISOR_PERFORMANCE_TIME_DICT + from profiler.advisor.display.prompt.en.overall_summary_advice_prompt import OverallSummaryAdvicePrompt else: - self.advice_map = Constant.ADVISOR_ADVICE_MAP_CN - self.time_name_map = Constant.TIME_NAME_MAP_CN - self.performance_time_dict = Constant.ADVISOR_PERFORMANCE_TIME_DICT_CN + from profiler.advisor.display.prompt.cn.overall_summary_advice_prompt import OverallSummaryAdvicePrompt + + self.advice_map = OverallSummaryAdvicePrompt.ADVICE_MAP + self.time_name_map = OverallSummaryAdvicePrompt.TIME_NAME_MAP + self.performance_time_dict = OverallSummaryAdvicePrompt.PERFORMANCE_TIME_DICT @staticmethod def split_duration_and_num(time_value: str) -> tuple: split_data = time_value.split("s") # time value example: 0.229s(1756) diff --git a/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py b/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py index 56dde89fb..ce5128bcc 100644 --- a/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py +++ b/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py @@ -17,6 +17,7 @@ import os from typing import Dict, List from collections import defaultdict from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset +from 
profiler.advisor.display.prompt.base_prompt import BasePrompt from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.cluster_analyse.common_func.file_manager import FileManager @@ -100,19 +101,10 @@ class CommunicationRetransmissionChecker: """ make record for what and how to optimize """ - language = AdditionalArgsManager().language - if language == "en": - problem_str = "Communication retransmission analysis" - else: - problem_str = "通信重传分析" - optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) + optimization_item = OptimizeItem(self.problem, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - if language == "en": - sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" - else: - sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" - + sub_table_name = BasePrompt.get_sub_table_name(self.problem, self.stage) result.add_detail(sub_table_name, headers=self.headers) for row in self.abnormal_rdma_list: @@ -138,7 +130,8 @@ class CommunicationRetransmissionChecker: ) syncbn_rule = FileManager.read_yaml_file(syncbn_rule_path) - self.desc = syncbn_rule.get("problem") + self.problem = syncbn_rule.get("problem") + self.desc = syncbn_rule.get("description") self.min_retransmission_time = syncbn_rule.get("min_retransmission_time") self.solutions = syncbn_rule.get("solutions") diff --git a/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py b/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py index 993f04ea8..86b79c4fd 100644 --- a/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py +++ b/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py @@ -18,6 +18,7 @@ from typing import List from profiler.advisor.dataset.communication.hccl_detail_dataset import HcclDetailDataset from 
profiler.advisor.dataset.profiling.info_collection import HcclTask from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor +from profiler.advisor.display.prompt.base_prompt import BasePrompt from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.prof_common.additional_args_manager import AdditionalArgsManager @@ -83,19 +84,12 @@ class ByteAlignmentChecker: """ make record for what and how to optimize """ - language = AdditionalArgsManager().language - if language == "en": - problem_str = "Byte Alignment Analysis" - else: - problem_str = "字节对齐分析" - optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) + optimization_item = OptimizeItem(self.problem, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - if language == "en": - sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str} " - else: - sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str} " + sub_table_name = BasePrompt.get_sub_table_name(self.problem, self.stage) result.add_detail(sub_table_name, headers=self.headers) + for hccl_op in self.abnormal_ops: result.add_detail(sub_table_name, detail=hccl_op) @@ -148,7 +142,8 @@ class ByteAlignmentChecker: ) byte_alignment_rule = FileManager.read_yaml_file(rule_path) - self.desc = byte_alignment_rule.get("problem") + self.problem = byte_alignment_rule.get("problem") + self.desc = byte_alignment_rule.get("description") self.min_size = byte_alignment_rule.get("min_size", self._MIN_SIZE) self.topk = byte_alignment_rule.get("top_num", 3) self.solutions = byte_alignment_rule.get("solutions") diff --git a/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py b/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py index 218876db8..f692d52a7 100644 --- 
a/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py +++ b/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py @@ -17,6 +17,7 @@ import os from typing import List from profiler.advisor.dataset.communication.communication_dataset import CommunicationDataset from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.advisor.display.prompt.base_prompt import BasePrompt from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.prof_common.file_manager import FileManager @@ -140,21 +141,12 @@ class BandwidthContentionChecker: """ make record for what and how to optimize """ - language = AdditionalArgsManager().language - - if language == "en": - problem_str = "Bandwidth Contention Analysis" - else: - problem_str = "带宽分析" - - optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) + optimization_item = OptimizeItem(self.problem, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - if language == "en": - sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" - else: - sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" + sub_table_name = BasePrompt.get_sub_table_name(self.problem, self.stage) result.add_detail(sub_table_name, headers=self.headers) + for hccl_op in self.abnormal_sdma_list: result.add_detail(sub_table_name, detail=[hccl_op.name, round(hccl_op.dur, 4), round(hccl_op.bandwidth, 2)]) @@ -180,7 +172,8 @@ class BandwidthContentionChecker: ) contention_rule = FileManager.read_yaml_file(contention_rule_path) - self.desc = contention_rule.get("problem") + self.problem = contention_rule.get("problem") + self.desc = contention_rule.get("description") self.threshold = contention_rule.get("threshold", 0) * contention_rule.get("sdma_baseline", 0) self.contention_topk = 
contention_rule.get("top_num", 3) self.solutions = contention_rule.get("solutions") diff --git a/profiler/advisor/analyzer/communication/packet/packet_checker.py b/profiler/advisor/analyzer/communication/packet/packet_checker.py index 7b556fa8d..332223ebd 100644 --- a/profiler/advisor/analyzer/communication/packet/packet_checker.py +++ b/profiler/advisor/analyzer/communication/packet/packet_checker.py @@ -15,6 +15,7 @@ import logging import os from profiler.advisor.dataset.communication.communication_dataset import CommunicationDataset +from profiler.advisor.display.prompt.base_prompt import BasePrompt from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.prof_common.file_manager import FileManager @@ -110,18 +111,10 @@ class PacketChecker: """ make record for what and how to optimize """ - language = AdditionalArgsManager().language - if language == "en": - problem_str = "Packet analysis" - else: - problem_str = "包分析" - optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) + optimization_item = OptimizeItem(self.problem, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - if language == "en": - sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" - else: - sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" + sub_table_name = BasePrompt.get_sub_table_name(self.problem, self.stage) result.add_detail(sub_table_name, headers=self.headers) result.add_detail(sub_table_name, detail=self.small_packet_detail) @@ -147,7 +140,8 @@ class PacketChecker: ) syncbn_rule = FileManager.read_yaml_file(syncbn_rule_path) - self.desc = syncbn_rule.get("problem") + self.problem = syncbn_rule.get("problem") + self.desc = syncbn_rule.get("description") self.sdma_desc = syncbn_rule.get("sdma_problem") self.rdma_desc = syncbn_rule.get("rdma_problem") self.min_sdma_size = 
convert_to_float(syncbn_rule.get("min_sdma_size")) diff --git a/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py b/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py index 1dcef3235..8220ac67b 100644 --- a/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py +++ b/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py @@ -17,6 +17,7 @@ import os from typing import Dict, List from collections import defaultdict from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset +from profiler.advisor.display.prompt.base_prompt import BasePrompt from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.prof_common.additional_args_manager import AdditionalArgsManager @@ -103,18 +104,10 @@ class CommunicationRetransmissionChecker: """ make record for what and how to optimize """ - language = AdditionalArgsManager().language - if language == "en": - problem_str = "Communication retransmission analysis" - else: - problem_str = "通信重传分析" - optimization_item = OptimizeItem(problem_str, self.desc, self.suggestions) + optimization_item = OptimizeItem(self.problem, self.desc, self.suggestions) result.add(OptimizeRecord(optimization_item)) - if language == "en": - sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" - else: - sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" + sub_table_name = BasePrompt.get_sub_table_name(self.problem, self.stage) result.add_detail(sub_table_name, headers=self.headers) for row in self.abnormal_rdma_list: @@ -141,7 +134,8 @@ class CommunicationRetransmissionChecker: ) syncbn_rule = FileManager.read_yaml_file(syncbn_rule_path) - self.desc = syncbn_rule.get("problem") + self.problem = 
syncbn_rule.get("problem") + self.desc = syncbn_rule.get("description") self.min_retransmission_time = syncbn_rule.get("min_retransmission_time") self.solutions = syncbn_rule.get("solutions") diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py index c8bdea37b..e5e00142f 100644 --- a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py @@ -80,20 +80,6 @@ class AICoreFreqChecker: if not self.ai_core_freq_issues: return - language = AdditionalArgsManager().language - if language == "en": - self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " - f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") - if self.rank: - self.desc = f"For rank {self.rank}, " + self.desc.lower() - self.suggestions = "Please check the temperature or max power of your machine." 
- else: - self.desc = ( - f"在降频期间发现{len(self.decrease_freq_ops)}个算子,频率降低比例超过了{self.DECREASE_FREQ_RATIO}。") - if self.rank: - self.desc = f"对于{self.rank}号卡," + self.desc - self.suggestions = "请检查您的机器温度或最大功率。" - def make_record(self, result: OptimizeResult): """ make record for what and how to optimize @@ -101,11 +87,21 @@ class AICoreFreqChecker: if not self.ai_core_freq_issues: return self.ai_core_freq_issues - sheet_name = "AI Core Frequency" + language = AdditionalArgsManager().language + if language == "en": + from profiler.advisor.display.prompt.en.ai_core_freq_prompt import AICoreFreqPrompt + else: + from profiler.advisor.display.prompt.cn.ai_core_freq_prompt import AICoreFreqPrompt + + problem = AICoreFreqPrompt.PROBLEM if self.rank is not None: - sheet_name = f"rank {self.rank} AI Core Frequency".capitalize() + problem += AICoreFreqPrompt.RANK_ID.format(self.rank) + + self.desc = AICoreFreqPrompt.DESCRIPTION.format(len(self.decrease_freq_ops), self.DECREASE_FREQ_RATIO) + if self.rank: + self.desc = AICoreFreqPrompt.RANK_DESCRIPTION.format(self.rank) + self.desc.lower() - optimization_item = OptimizeItem(sheet_name, self.desc, [self.suggestions]) + optimization_item = OptimizeItem(problem, self.desc, [AICoreFreqPrompt.SUGGESTION]) result.add(OptimizeRecord(optimization_item)) self.headers = [ @@ -117,10 +113,10 @@ class AICoreFreqChecker: "Max frequency", "Min frequency", ] - result.add_detail(sheet_name, headers=self.headers) + result.add_detail(problem, headers=self.headers) for row in self.decrease_freq_ops: - result.add_detail(sheet_name, detail=row) + result.add_detail(problem, detail=row) return True def make_render(self, html_render, add_render_list=True, **kwargs): diff --git a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py index eb341d0bf..d0702489c 100644 --- a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py +++ 
b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py @@ -14,6 +14,7 @@ # limitations under the License. import copy import logging +import os from typing import List from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker @@ -21,33 +22,34 @@ from profiler.advisor.config.config import Config from profiler.advisor.dataset.profiling.info_collection import OpInfo from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord from profiler.prof_common.additional_args_manager import AdditionalArgsManager +from profiler.prof_common.file_manager import FileManager logger = logging.getLogger() class DynamicShapeChecker(OperatorChecker): - ENABLE_COMPILED_SUGGESTION = "1. Please try to set environment by execute `export HOST_CACHE_CAPACITY=20`.\n." \ - "2. Please place the following code at the entrance of the python script to disable jit compile.\n " \ - "Code: `torch_npu.npu.set_compile_mode(jit_compile=False) \n " \ - "torch_npu.npu.config.allow_internal_format = False`.\n" - ENABLE_COMPILED_SUGGESTION_CN = "1. 尝试设置环境变量'export HOST_CACHE_CAPACITY=20'。\n" \ - "2. 
在python脚本入口加入以下代码关闭在线编译:\n" \ - "'torch_npu.npu.set_compile_mode(jit_compile=False) \n " \ - "torch_npu.npu.config.allow_internal_format = False' \n" - _SUGGESTION: List[str] = [ENABLE_COMPILED_SUGGESTION] - _SUGGESTION_CN: List[str] = [ENABLE_COMPILED_SUGGESTION_CN] _CHECKER = "dynamic shape operator" - _PROBLEM = "Dynamic shape operator" - _PROBLEM_CN = "动态shape算子" - _description = f"Found all operators are dynamic shape" - _description_cn = f"找到所有是动态shape的算子" - _op_list: List[OpInfo] = [] _tune_op_list: List[str] = [] # record op name to be tuned, and save to tune_ops_file.cfg _op_views: List = [] def __init__(self, cann_version) -> None: super().__init__(cann_version=cann_version) + self._init_prompt_by_language() + + def _init_prompt_by_language(self): + language = AdditionalArgsManager().language + if language == "en": + from profiler.advisor.display.prompt.en.dynamic_shape_prompt import DynamicShapePrompt + else: + from profiler.advisor.display.prompt.cn.dynamic_shape_prompt import DynamicShapePrompt + + self.rank_id = DynamicShapePrompt.RANK_ID + self._PROBLEM = DynamicShapePrompt.PROBLEM + self._description = DynamicShapePrompt.DESCRIPTION + self.enable_compiled_suggestion = DynamicShapePrompt.ENABLE_COMPILED_SUGGESTION + self._SUGGESTION = [DynamicShapePrompt.ENABLE_COMPILED_SUGGESTION] + self.release_suggestion = DynamicShapePrompt.RELEASE_SUGGESTION def check(self, profiling_data) -> bool: return self.is_dynamic_shape(profiling_data) @@ -56,23 +58,13 @@ class DynamicShapeChecker(OperatorChecker): """ make record for what and how to optimize """ - language = AdditionalArgsManager().language - if language == "en": - if rank is not None: - self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() - optimization_item = OptimizeItem( - self._PROBLEM, - self._description, - self._SUGGESTION - ) - else: - if rank is not None: - self._PROBLEM_CN = f"{rank}号卡 ".capitalize() + self._PROBLEM_CN - optimization_item = OptimizeItem( - self._PROBLEM_CN, - 
self._description_cn, - self._SUGGESTION_CN - ) + if rank is not None: + self._PROBLEM = self.rank_id.format(rank) + self._PROBLEM.lower() + optimization_item = OptimizeItem( + self._PROBLEM, + self._description, + self._SUGGESTION + ) statistics_item = StatisticsItem("", "", 1) return OptimizeRecord(optimization_item, statistics_item) @@ -87,12 +79,8 @@ class DynamicShapeChecker: release_suggestion_list = [] for suggestion in optimization_item.suggestion: release_suggestion = copy.deepcopy(suggestion) - if release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION: - release_suggestion += \ - f"for details please refer to link : LINK" - elif release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION_CN: - release_suggestion += \ - f"详细信息请参考:链接" + if release_suggestion == self.enable_compiled_suggestion: + release_suggestion += self.release_suggestion.format(Config().enable_compiled_tune_url) release_suggestion_list.append(release_suggestion.replace('\n', '
')) format_result = {"record": record.__dict__, "suggestion": '
'.join(release_suggestion_list)} return format_result diff --git a/profiler/advisor/analyzer/computation/operator_checker.py b/profiler/advisor/analyzer/computation/operator_checker.py index 84215be9a..1be199b82 100644 --- a/profiler/advisor/analyzer/computation/operator_checker.py +++ b/profiler/advisor/analyzer/computation/operator_checker.py @@ -47,26 +47,27 @@ class OperatorChecker(VersionControl): _SUGGESTION_CN: List[str] = [] SKIP_CHECK_MSG = "Skip %s checker because of not containing %s" _tune_op_info_list: List[OpInfo] = [] - PyTorch_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE, such as:\n" \ - f"'aoe --job_type=2 --model_path=$user_dump_path " \ - f"--tune_ops_file={Config().tune_ops_file}'\n" - PyTorch_OPERATOR_TUNE_SUGGESTION_CN = f"通过AOE优化算子,使用样例如下:\n" \ - f"'aoe --job_type=2 --model_path=$user_dump_path " \ - f"--tune_ops_file={Config().tune_ops_file}'\n" - MSLite_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE in mindspore lite framework, such as:\n" \ - f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \ - f"--modelFile=$user_model.onnx --outputFile=user_model " \ - f"--configFile=./config.txt\n" - MSLite_OPERATOR_TUNE_SUGGESTION_CN = f"在Mindpore Lite 框架通过AOE优化算子,使用样例如下:\n" \ - f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \ - f"--modelFile=$user_model.onnx --outputFile=user_model " \ - f"--configFile=./config.txt\n" def __init__(self, cann_version: str): self.cann_version = cann_version self._op_list: List[OpInfo] = [] self._tune_op_list: List[str] = [] + self._init_prompt_by_language() + + def _init_prompt_by_language(self): + language = AdditionalArgsManager().language + if language == "en": + from profiler.advisor.display.prompt.en.operator_prompt import OperatorPrompt + else: + from profiler.advisor.display.prompt.cn.operator_prompt import OperatorPrompt + + self.rank_id = OperatorPrompt.RANK_ID + self.pytorch_op_tune_suggestion = 
OperatorPrompt.PYTORCH_OPERATOR_TUNE_SUGGESTION + self.mslite_op_tune_suggestion = OperatorPrompt.MSLITE_OPERATOR_TUNE_SUGGESTION + self.pytorch_release_suggestion = OperatorPrompt.PYTORCH_RELEASE_SUGGESTION + self.mslite_release_suggestion = OperatorPrompt.MSLITE_RELEASE_SUGGESTION + @staticmethod def get_ratio(op_info: OpInfo, attr: str) -> float: if not op_info.has_attr(attr): @@ -127,12 +128,8 @@ class OperatorChecker(VersionControl): :param profiling_data: profiling data :return: optimize record """ - language = AdditionalArgsManager().language if rank is not None: - if language == "en": - self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() - else: - self._PROBLEM_CN = f"{rank}号卡".capitalize() + self._PROBLEM_CN + self._PROBLEM = self.rank_id.format(rank) + self._PROBLEM.lower() task_duration_list = [float(op_info.get_attr("task_duration")) for op_info in self._op_list @@ -141,18 +138,12 @@ class OperatorChecker(VersionControl): total_task_duration = profiling_data.op_summary.get_total_task_duration() count = len(task_duration_list) statistics_item = StatisticsItem(total_task_duration, total_cost_time, count, self.get_incomes()) - if language == "en": - optimization_item = OptimizeItem( - self._PROBLEM, - self._get_description(self._description, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]), - self._SUGGESTION - ) - else: - optimization_item = OptimizeItem( - self._PROBLEM_CN, - self._get_description(self._description_cn, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]), - self._SUGGESTION_CN - ) + + optimization_item = OptimizeItem( + self._PROBLEM, + self._get_description(self._description, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]), + self._SUGGESTION + ) return OptimizeRecord(optimization_item, statistics_item) @@ -218,28 +209,12 @@ class OperatorChecker(VersionControl): release_suggestion_list = [] for suggestion in optimization_item.suggestion: release_suggestion = copy.deepcopy(suggestion) 
- if release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION: - release_suggestion += \ - (f"for details please refer to link : LINK") - elif release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION_CN: - release_suggestion += \ - (f"详细信息请参考:链接") - elif release_suggestion == OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION: - release_suggestion += \ - (f"\nThe config file for MSLite AOE usage is as follows:\n" \ - f"[ascend_context]\n" \ - f"aoe_mode=\"operator tuning\"\n" \ - f"--tune_ops_file={Config().tune_ops_file}\n" - f"\nFor details please refer to link : LINK") - elif release_suggestion == OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION_CN: - release_suggestion += \ - (f"\nMSLite AOE的配置文件如下usage:\n" \ - f"[ascend_context]\n" \ - f"aoe_mode=\"operator tuning\"\n" \ - f"--tune_ops_file={Config().tune_ops_file}\n" - f"\n详细信息请参考:链接") + if release_suggestion == self.pytorch_op_tune_suggestion: + release_suggestion += (self.pytorch_release_suggestion.format(Config().pytorch_aoe_operator_tune_url)) + elif release_suggestion == self.mslite_op_tune_suggestion: + release_suggestion += (self.mslite_release_suggestion.format( + Config().tune_ops_file, Config().mslite_infer_aoe_operator_tune_url)) + release_suggestion_list.append(release_suggestion.replace('\n', '
')) format_result = { "record": record.__dict__, @@ -356,15 +331,9 @@ class OperatorChecker(VersionControl): def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: language = AdditionalArgsManager().language if profiling_data.PROF_TYPE == EnumParamsParser().profiling_type.ascend_pytorch_profiler: - if language == "en": - self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) - else: - self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION_CN) + self._SUGGESTION.append(self.pytorch_op_tune_suggestion) elif profiling_data.PROF_TYPE == EnumParamsParser.profiling_type.mslite: - if language == "en": - self._SUGGESTION_CN.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) - else: - self._SUGGESTION_CN.append(self.MSLite_OPERATOR_TUNE_SUGGESTION_CN) + self._SUGGESTION.append(self.mslite_op_tune_suggestion) def _check_data(self, profiling_data): return True diff --git a/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py index 96505cb6e..7c9218c4a 100644 --- a/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py +++ b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py @@ -183,17 +183,15 @@ class GraphFusionRules: language = AdditionalArgsManager().language if language == "en": - optimization_item = OptimizeItem( - "fusion issue", - f"Found {len(self.candidates)} fusion issues", - ["Check fusion issues detail in mstt_advisor*.html"] - ) + from profiler.advisor.display.prompt.en.graph_fusion_prompt import GraphFusionPrompt else: - optimization_item = OptimizeItem( - "融合问题", - f"发现 {len(self.candidates)} 个融合问题", - ["在mstt_advisor*.html中查看融合问题的细节信息"] - ) + from profiler.advisor.display.prompt.cn.graph_fusion_prompt import GraphFusionPrompt + + optimization_item = OptimizeItem( + GraphFusionPrompt.PRIBLEM, + GraphFusionPrompt.DESCRIPTION.format(len(self.candidates)), + [GraphFusionPrompt.SUGGESTION] + ) total_time = 0.0 for candidate in 
self.task_duration_list: diff --git a/profiler/advisor/analyzer/overall/environment_variable_checker.py b/profiler/advisor/analyzer/overall/environment_variable_checker.py index dbacdf183..31323c6f3 100644 --- a/profiler/advisor/analyzer/overall/environment_variable_checker.py +++ b/profiler/advisor/analyzer/overall/environment_variable_checker.py @@ -36,11 +36,7 @@ class EnvironmentVariabelChecker: "ASCEND_LAUNCH_BLOCKING": lambda x: convert_to_int(x) != 1, } - language = AdditionalArgsManager().language - if language == "en": - HEADERS = ["Environment", "Value", "Description", "Suggestion"] - else: - HEADERS = ["环境变量", "值", "描述", "建议"] + HEADERS = ["Environment", "Value", "Description", "Suggestion"] def __init__(self): self.environment_info = self.read_environment_info() @@ -85,24 +81,22 @@ class EnvironmentVariabelChecker: def make_record(self, result: OptimizeResult): if not self.env_suggest_csv: return + language = AdditionalArgsManager().language if language == "en": - desc = f"Describe and suggest the optimal environment variable settings" - suggestion = "Please set the optimal environment variable" + from profiler.advisor.display.prompt.en.environment_variable_prompt import EnvironmentVariablePrompt else: - desc = f"描述并给出最优的环境变量配置建议" - suggestion = "请设置最优的环境变量" - + from profiler.advisor.display.prompt.cn.environment_variable_prompt import EnvironmentVariablePrompt optimization_item = OptimizeItem( - SupportedScopes.ENVIRONMENT_VARIABLE_ANALYSIS, - desc, - [suggestion] + EnvironmentVariablePrompt.PRIBLEM, + EnvironmentVariablePrompt.DESCRIPTION, + [EnvironmentVariablePrompt.SUGGESTION] ) result.add(OptimizeRecord(optimization_item)) - result.add_detail(SupportedScopes.ENVIRONMENT_VARIABLE_ANALYSIS, headers=self.HEADERS) + result.add_detail(EnvironmentVariablePrompt.PRIBLEM, headers=self.HEADERS) for env_suggest in self.env_suggest_csv: - result.add_detail(SupportedScopes.ENVIRONMENT_VARIABLE_ANALYSIS, detail=env_suggest) + 
result.add_detail(EnvironmentVariablePrompt.PRIBLEM, detail=env_suggest) def make_render(self, html_render: HTMLRender): if not self.env_suggest_html: diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index d8e3b5701..7c85f200d 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -42,17 +42,19 @@ class OverallSummaryAnalyzer(BaseAnalyzer): self.bottleneck_str = "" self.over_summary_analysis = {} - language = AdditionalArgsManager + self._init_prompt_by_language() + + def _init_prompt_by_language(self): + language = AdditionalArgsManager().language if language == "en": - self.over_summary_analyzer = Constant.OVERALL_SUMMARY_ANALYZER - self.advice_map = Constant.ANALYZER_ADVICE_MAP - self.time_name_map = Constant.TIME_NAME_MAP - self.performance_time_dict = Constant.ANALYZER_PERFORMANCE_TIME_DICT + from profiler.advisor.display.prompt.en.overall_summary_advice_prompt import OverallSummaryAnalyzePrompt else: - self.over_summary_analyzer = Constant.OVERALL_SUMMARY_ANALYZER_CN - self.advice_map = Constant.ANALYZER_ADVICE_MAP_CN - self.time_name_map = Constant.TIME_NAME_MAP_CN - self.performance_time_dict = Constant.ANALYZER_PERFORMANCE_TIME_DICT_CN + from profiler.advisor.display.prompt.cn.overall_summary_advice_prompt import OverallSummaryAnalyzePrompt + + self.over_summary_analyzer = OverallSummaryAnalyzePrompt.OVERALL_SUMMARY_ANALYZER + self.advice_map = OverallSummaryAnalyzePrompt.PERFORMANCE_TIME_DICT + self.time_name_map = OverallSummaryAnalyzePrompt.TIME_NAME_MAP + self.performance_time_dict = OverallSummaryAnalyzePrompt.PERFORMANCE_TIME_DICT @staticmethod def calculate_ratio(dividend, divisor): @@ -212,18 +214,18 @@ class OverallSummaryAnalyzer(BaseAnalyzer): if not self.bottleneck_str and not self.cur_advices: return optimization_item = OptimizeItem( - 
OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, + self.over_summary_analyzer, self.bottleneck_str, self.cur_advices ) self.result.add(OptimizeRecord(optimization_item)) self.result.add_detail( - OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, + self.over_summary_analyzer, headers=self.over_summary_analysis["headers"] ) for data in self.over_summary_analysis["data"]: - self.result.add_detail(OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, detail=data) + self.result.add_detail(self.over_summary_analyzer, detail=data) def make_render(self): if not self.bottleneck_str and not self.cur_advices: @@ -236,7 +238,7 @@ class OverallSummaryAnalyzer(BaseAnalyzer): "details": [self.over_summary_analysis] } self.html_render.render_template(key="overall", - title=OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, + title=self.over_summary_analyzer, template_dir="templates", template_name="cluster_analysis.html", cann_version=self.cann_version, diff --git a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py index 71b44dd99..1e26fa691 100644 --- a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py +++ b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py @@ -80,31 +80,24 @@ class OpDispatchAnalyzer(BaseAnalyzer): """ if not self._op_compile or len(self._issues_record) <= 0: return + language = AdditionalArgsManager().language if language == "en": - desc = f"Found {self._op_compile.total_count} operator compile issues." - suggestion = ("Please place the following code at the entrance of the python script to disable jit compile. 
\n" \ - "Code: `torch_npu.npu.set_compile_mode(jit_compile=False) \n" - "torch_npu.npu.config.allow_internal_format = False` \n") - problem_str = "Operator dispatch" + from profiler.advisor.display.prompt.en.timeline_op_dispatch_prompt import TimelineOpDispatchPrompt else: - desc = f"发现{self._op_compile.total_count}个算子编译问题。" - suggestion = ( - "请在python脚本入口添加以下代码关闭在线编译:\n" \ - "'torch_npu.npu.set_compile_mode(jit_compile=False) \n" - "torch_npu.npu.config.allow_internal_format = False' \n") - problem_str = "算子下发" - self.optimization_item.append(OptimizeItem(problem_str, desc, [suggestion])) + from profiler.advisor.display.prompt.cn.timeline_op_dispatch_prompt import TimelineOpDispatchPrompt + + self.optimization_item.append(OptimizeItem( + TimelineOpDispatchPrompt.PRIBLEM, + TimelineOpDispatchPrompt.DESCRIPTION.format(self._op_compile.total_count), + [TimelineOpDispatchPrompt.SUGGESTION])) for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) - if language == "en": - record_title = ["Issues", "op name", "counts", "total time"] - else: - record_title = ["问题", "算子名称", "数量", "总时长"] - result.add_detail(problem_str, headers=record_title) + record_title = ["Issues", "op name", "counts", "total time"] + result.add_detail(TimelineOpDispatchPrompt.PRIBLEM, headers=record_title) for op_info in self._issues_record: - result.add_detail(problem_str, detail=op_info) + result.add_detail(TimelineOpDispatchPrompt.PRIBLEM, detail=op_info) def make_render(self, html_render, **kwargs): issues = [] diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py index 6b549ad0c..3a2ad9c7a 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -97,49 +97,33 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): return language = AdditionalArgsManager().language if 
language == "en": - desc = f"Found {len(format_timeline_result(self.matched_op_stacks))} apis to be replaced" \ - f" based on the runtime env cann-{self.cann_version} and torch-{self.torch_version}" - - suggestion = "Please replace training api according to sub table 'Affinity training api'" - if self.empty_stacks: - desc += ", but with no stack" - suggestion = Constant.TIMELINE_EMPTY_STACKS_PROMPT.format( - timeline_profiling_doc_url=Config().timeline_with_stack_doc_url - ) - sheet_name = "Affinity apis" + from profiler.advisor.display.prompt.en.fusion_ops_prompt import FusionOpsPrompt else: - desc = f"目前运行环境版本为cann-{self.cann_version}和torch-{self.torch_version}," \ - f"发现有{len(format_timeline_result(self.matched_op_stacks))}个api接口可以替换。" - suggestion = "请根据子表'Affinity training api'替换训练api接口" - if self.empty_stacks: - desc += ",但没有堆栈" - suggestion = Constant.TIMELINE_EMPTY_STACKS_PROMPT_CN.format( - timeline_profiling_doc_url=Config().timeline_with_stack_doc_url - ) - sheet_name = "亲和API接口" - - optimization_item = OptimizeItem( - sheet_name, - desc, - [suggestion] - ) + from profiler.advisor.display.prompt.cn.fusion_ops_prompt import FusionOpsPrompt - self.result.add(OptimizeRecord(optimization_item)) + desc = FusionOpsPrompt.DESCRIPTION.format(self.cann_version, self.torch_version, + len(format_timeline_result(self.matched_op_stacks))) + suggestion = FusionOpsPrompt.SUGGESTION + if self.empty_stacks: + desc += FusionOpsPrompt.EMPTY_STACK_DESCRIPTION + suggestion = FusionOpsPrompt.EMPTY_STACKS_SUGGESTION.format( + timeline_profiling_doc_url=Config().timeline_with_stack_doc_url + ) - if language == "en": - record_title = ["Affinity API", "Code stacks", "Stack called counts"] - else: - record_title = ["亲和API接口", "代码堆栈", "堆栈调用数量"] - self.result.add_detail(sheet_name, headers=record_title) + optimization_item = OptimizeItem(FusionOpsPrompt.PROBLEM, desc, [suggestion]) + + self.result.add(OptimizeRecord(optimization_item)) + record_title = ["Affinity API", "Code 
stacks", "Stack called counts"] + self.result.add_detail(FusionOpsPrompt.PROBLEM, headers=record_title) for api_name, stacks_info in format_timeline_result(self.matched_op_stacks).items(): if not stacks_info: detail = [api_name, "null", "null"] - self.result.add_detail(sheet_name, detail=detail) + self.result.add_detail(FusionOpsPrompt.PROBLEM, detail=detail) else: for stack in stacks_info: detail = [api_name, *stack] - self.result.add_detail(sheet_name, detail=detail) + self.result.add_detail(FusionOpsPrompt.PROBLEM, detail=detail) def make_render(self, **kwargs): rank = kwargs.get("rank") diff --git a/profiler/advisor/analyzer/schedule/gc/gc_checker.py b/profiler/advisor/analyzer/schedule/gc/gc_checker.py index 453f9eb5d..eb7dea746 100644 --- a/profiler/advisor/analyzer/schedule/gc/gc_checker.py +++ b/profiler/advisor/analyzer/schedule/gc/gc_checker.py @@ -17,6 +17,7 @@ import math import os from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset +from profiler.advisor.display.prompt.base_prompt import BasePrompt from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.utils.utils import convert_to_float, convert_to_int, safe_division @@ -85,25 +86,14 @@ class GcChecker: if not self.gc_issues: return - language = AdditionalArgsManager().language - if language == "en": - problem_str = "GC Analysis" - else: - problem_str = "GC分析" - - self.optimization_item.append(OptimizeItem(problem_str, self.desc, self.suggestions)) + self.optimization_item.append(OptimizeItem(self.problem, self.desc, self.suggestions)) for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) - if language == "en": - if self.rank is not None: - self.headers = ["Rank id"] + self.headers - sub_table_name = problem_str if not self.stage else f"Stage-{self.stage}: {problem_str}" - else: - if self.rank is not None: - self.headers = ["卡号"] + self.headers - 
sub_table_name = problem_str if not self.stage else f"阶段-{self.stage}:{problem_str}" + if self.rank is not None: + self.headers = ["Rank id"] + self.headers + sub_table_name = BasePrompt.get_sub_table_name(self.problem, self.stage) result.add_detail(sub_table_name, headers=self.headers) for row in self.abnormal_gc_list: @@ -188,6 +178,7 @@ class GcChecker: gc_rule = FileManager.read_yaml_file(gc_rule_path) + self.problem = gc_rule.get("problem") self.gc_threshold = convert_to_float(gc_rule.get("gc_threshold", 0)) self.gc_topk_num = convert_to_int(gc_rule.get("top_num", 0)) self.gc_problem_with_count = gc_rule.get("gc_problem_with_count", "") diff --git a/profiler/advisor/common/analyzer_scopes.py b/profiler/advisor/common/analyzer_scopes.py index 368869305..369c3f877 100644 --- a/profiler/advisor/common/analyzer_scopes.py +++ b/profiler/advisor/common/analyzer_scopes.py @@ -20,52 +20,26 @@ class SupportedScopes: # used for specify fourth-level commands and define the key of the result dict # the key defined bellow must be the same as value - language = AdditionalArgsManager().language - if language == "en": - TIMELINE_FUSION_OPS = "timeline_fusion_ops" - GRAPH = "graph" - SLOW_RANK = "slow_rank" - SLOW_LINK = "slow_link" - COMMUNICATION_RETRANSMISSION_DETECTION = "communication_retransmission_analysis" - PACKET = "packet_analysis" - BANDWIDTH_CONTENTION_DETECTION = "bandwidth_contention_analysis" - BYTE_ALIGNMENT_DETECTION = "byte_alignment_analysis" - OVER_ALL = "over_all" - ENVIRONMENT_VARIABLE_ANALYSIS = "environment_variable_analysis" - DYNAMIC_SHAPE_ANALYSIS = "dynamic_shape_analysis" - AICPU_ANALYSIS = "aicpu_analysis" - BLOCK_DIM_ANALYSIS = "block_dim_analysis" - OPERATOR_NO_BOUND_ANALYSIS = "operator_no_bound_analysis" - TIMELINE_OP_DISPATCH = "timeline_op_dispatch" - DATALOADER = "dataloader" - SYNCBN = "syncbn" - SYNCHRONIZE_STREAM = "synchronize_stream" - FREQ_ANALYSIS = "freq_analysis" - MEMORY = "memory" - STAGE_COMPUTE = "stage_compute" - GC_ANALYSIS 
= "gc_analysis" - COMPARISON = "comparison" - else: - TIMELINE_FUSION_OPS = "融合算子" - GRAPH = "图" - SLOW_RANK = "慢节点" - SLOW_LINK = "慢链路" - COMMUNICATION_RETRANSMISSION_DETECTION = "通信重传分析" - PACKET = "包分析" - BANDWIDTH_CONTENTION_DETECTION = "带宽限制分析" - BYTE_ALIGNMENT_DETECTION = "字节对齐分析" - OVER_ALL = "总览" - ENVIRONMENT_VARIABLE_ANALYSIS = "环境变量分析" - DYNAMIC_SHAPE_ANALYSIS = "动态shape分析" - AICPU_ANALYSIS = "aicpu分析" - BLOCK_DIM_ANALYSIS = "AICore核数分析" - OPERATOR_NO_BOUND_ANALYSIS = "算子瓶颈分析" - TIMELINE_OP_DISPATCH = "调度" - DATALOADER = "数据加载" - SYNCBN = "batchnorm同步" - SYNCHRONIZE_STREAM = "流同步" - FREQ_ANALYSIS = "频率分析" - MEMORY = "内存" - STAGE_COMPUTE = "阶段计算" - GC_ANALYSIS = "gc分析" - COMPARISON = "对比" + TIMELINE_FUSION_OPS = "timeline_fusion_ops" + GRAPH = "graph" + SLOW_RANK = "slow_rank" + SLOW_LINK = "slow_link" + COMMUNICATION_RETRANSMISSION_DETECTION = "communication_retransmission_analysis" + PACKET = "packet_analysis" + BANDWIDTH_CONTENTION_DETECTION = "bandwidth_contention_analysis" + BYTE_ALIGNMENT_DETECTION = "byte_alignment_analysis" + OVER_ALL = "over_all" + ENVIRONMENT_VARIABLE_ANALYSIS = "environment_variable_analysis" + DYNAMIC_SHAPE_ANALYSIS = "dynamic_shape_analysis" + AICPU_ANALYSIS = "aicpu_analysis" + BLOCK_DIM_ANALYSIS = "block_dim_analysis" + OPERATOR_NO_BOUND_ANALYSIS = "operator_no_bound_analysis" + TIMELINE_OP_DISPATCH = "timeline_op_dispatch" + DATALOADER = "dataloader" + SYNCBN = "syncbn" + SYNCHRONIZE_STREAM = "synchronize_stream" + FREQ_ANALYSIS = "freq_analysis" + MEMORY = "memory" + STAGE_COMPUTE = "stage_compute" + GC_ANALYSIS = "gc_analysis" + COMPARISON = "comparison" \ No newline at end of file diff --git a/profiler/advisor/display/prompt/__init__.py b/profiler/advisor/display/prompt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/profiler/advisor/display/prompt/base_prompt.py b/profiler/advisor/display/prompt/base_prompt.py new file mode 100644 index 000000000..b3b4751ca --- /dev/null +++ 
b/profiler/advisor/display/prompt/base_prompt.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from profiler.prof_common.additional_args_manager import AdditionalArgsManager + + +class BasePrompt: + @staticmethod + def get_sub_table_name(problem, stage): + language = AdditionalArgsManager().language + if language == "en": + sub_table_name = problem if not stage else f"Stage-{stage}: {problem}" + else: + sub_table_name = problem if not stage else f"阶段-{stage}:{problem}" + return sub_table_name diff --git a/profiler/advisor/display/prompt/cn/__init__.py b/profiler/advisor/display/prompt/cn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/profiler/advisor/display/prompt/cn/ai_core_freq_prompt.py b/profiler/advisor/display/prompt/cn/ai_core_freq_prompt.py new file mode 100644 index 000000000..9a00926bf --- /dev/null +++ b/profiler/advisor/display/prompt/cn/ai_core_freq_prompt.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class AICoreFreqPrompt(object): + RANK_ID = "{}号卡" + PROBLEM = "AIcore频率" + DESCRIPTION = "在降频期间发现{}个算子,频率降低比例超过了{}。" + RANK_DESCRIPTION = "对于{}号卡," + SUGGESTION = "请检查您的机器温度或最大功率。" \ No newline at end of file diff --git a/profiler/advisor/display/prompt/cn/dynamic_shape_prompt.py b/profiler/advisor/display/prompt/cn/dynamic_shape_prompt.py new file mode 100644 index 000000000..79c2e1ad2 --- /dev/null +++ b/profiler/advisor/display/prompt/cn/dynamic_shape_prompt.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class DynamicShapePrompt(object): + RANK_ID = "{}号卡" + PROBLEM = "动态shape算子" + DESCRIPTION = f"找到所有是动态shape的算子" + ENABLE_COMPILED_SUGGESTION = "1. 尝试设置环境变量'export HOST_CACHE_CAPACITY=20'。\n" \ + "2. 
在python脚本入口加入以下代码关闭在线编译:\n" \ + "'torch_npu.npu.set_compile_mode(jit_compile=False) \n " \ + "torch_npu.npu.config.allow_internal_format = False' \n" + RELEASE_SUGGESTION = "详细信息请参考:链接" \ No newline at end of file diff --git a/profiler/advisor/display/prompt/cn/environment_variable_prompt.py b/profiler/advisor/display/prompt/cn/environment_variable_prompt.py new file mode 100644 index 000000000..5536af0d7 --- /dev/null +++ b/profiler/advisor/display/prompt/cn/environment_variable_prompt.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class EnvironmentVariablePrompt(object): + PROBLEM = "环境变量分析" + DESCRIPTION = "描述并给出最优的环境变量配置建议" + SUGGESTION = "请设置最优的环境变量" diff --git a/profiler/advisor/display/prompt/cn/fusion_ops_prompt.py b/profiler/advisor/display/prompt/cn/fusion_ops_prompt.py new file mode 100644 index 000000000..0113cb58a --- /dev/null +++ b/profiler/advisor/display/prompt/cn/fusion_ops_prompt.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class FusionOpsPrompt(object): + PROBLEM = "亲和API接口" + DESCRIPTION = "目前运行环境版本为cann-{}和torch-{},发现有{}个api接口可以替换。" + SUGGESTION = "请根据子表'Affinity training api'替换训练api接口" + EMPTY_STACK_DESCRIPTION = ",但没有堆栈" + EMPTY_STACKS_SUGGESTION = "这些API接口没有代码堆栈。如果采集profiling时参数为'with_stack=False'," \ + "请参考{}设置'with_stack=True'。" \ + "另外,由于反向传播没有堆栈,请忽略以下亲和APIs。" diff --git a/profiler/advisor/display/prompt/cn/graph_fusion_prompt.py b/profiler/advisor/display/prompt/cn/graph_fusion_prompt.py new file mode 100644 index 000000000..4366a0947 --- /dev/null +++ b/profiler/advisor/display/prompt/cn/graph_fusion_prompt.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +class GraphFusionPrompt(object): + PROBLEM = "融合问题" + DESCRIPTION = "发现 {} 个融合问题" + SUGGESTION = "在mstt_advisor*.html中查看融合问题的细节信息" \ No newline at end of file diff --git a/profiler/advisor/display/prompt/cn/operator_prompt.py b/profiler/advisor/display/prompt/cn/operator_prompt.py new file mode 100644 index 000000000..4eb09da55 --- /dev/null +++ b/profiler/advisor/display/prompt/cn/operator_prompt.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +class OperatorPrompt(object): + RANK_ID = "{}号卡" + PYTORCH_OPERATOR_TUNE_SUGGESTION = "通过AOE优化算子,使用样例如下:\n" \ + "'aoe --job_type=2 --model_path=$user_dump_path " \ + "--tune_ops_file={}'\n" + MSLITE_OPERATOR_TUNE_SUGGESTION = f"在MindSpore Lite 框架通过AOE优化算子,使用样例如下:\n" \ + f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \ + f"--modelFile=$user_model.onnx --outputFile=user_model " \ + f"--configFile=./config.txt\n" + PYTORCH_RELEASE_SUGGESTION = "详细信息请参考:链接" + MSLITE_RELEASE_SUGGESTION = "\nMSLite AOE的配置文件如下usage:\n" \ + "[ascend_context]\n" \ + "aoe_mode=\"operator tuning\"\n" \ + "--tune_ops_file={}\n" \ + "\n详细信息请参考:链接" \ No newline at end of file diff --git a/profiler/advisor/display/prompt/cn/overall_summary_advice_prompt.py b/profiler/advisor/display/prompt/cn/overall_summary_advice_prompt.py new file mode 100644 index 000000000..2f41edad0 --- /dev/null +++ b/profiler/advisor/display/prompt/cn/overall_summary_advice_prompt.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class OverallSummaryAdvicePrompt(object): + ADVICE_MAP = { + "计算时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor computation.", + "未被掩盖的通信时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule.", + "空闲时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule."
+ } + TIME_NAME_MAP = { + "计算时长": "computing", + "未被掩盖的通信时长": "communication", + "空闲时长": "free", + 'Cube算子时长(数量)': 'Cube Time', + 'Vector算子时长(数量)': 'Vector Time', + 'Flash Attention算子时长(前向)(数量)': 'Flash Attention Time(Forward)', + 'Flash Attention算子时长(反向)(数量)': 'Flash Attention Time(Backward)', + '其它时长': "Other Computing Time", + 'SDMA时长(数量)': 'SDMA Time' + } + PERFORMANCE_TIME_DICT = { + "计算时长": ['Cube时长(数量)', 'Vector时长(数量)', 'Flash Attention时长(前向)(数量)', + 'Flash Attention时长(反向)(数量)', '其它时长'], + "未被掩盖的通信时长(等待时长)": [], + "空闲时长": ['SDMA Time(Num)'] + } \ No newline at end of file diff --git a/profiler/advisor/display/prompt/cn/overall_summary_analyzer_prompt.py b/profiler/advisor/display/prompt/cn/overall_summary_analyzer_prompt.py new file mode 100644 index 000000000..7690e7541 --- /dev/null +++ b/profiler/advisor/display/prompt/cn/overall_summary_analyzer_prompt.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +class OverallSummaryAnalyzePrompt(object): + OVERALL_SUMMARY_ANALYZER = "整网耗时分析" + ADVICE_MAP = { + "计算时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", + "未被掩盖的通信时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", + "空闲时长": "如果你想了解更多详细建议请看mstt_advisor_*.html" + } + TIME_NAME_MAP = { + "计算时长": "computing", + "未被掩盖的通信时长": "communication", + "空闲时长": "free", + 'Cube算子时长(数量)': 'Cube Time', + 'Vector算子时长(数量)': 'Vector Time', + 'Flash Attention算子时长(前向)(数量)': 'Flash Attention Time(Forward)', + 'Flash Attention算子时长(反向)(数量)': 'Flash Attention Time(Backward)', + '其它时长': "Other Computing Time", + 'SDMA时长(数量)': 'SDMA Time' + } + PERFORMANCE_TIME_DICT = { + "计算时长": "computing_time_ms", + " -- Flash Attention": "fa_time_ms", + " -- Conv": "conv_time_ms", + " -- Matmul": "matmul_time_ms", + " -- Vector": "vector_time_ms", + " -- SDMA(Tensor Move)": "tensor_move_time_ms", + " -- 其它Cube": "other_cube_time_ms", + "未被掩盖的通信时长": "uncovered_communication_time_ms", + " -- 等待时长": "wait_time_ms", + " -- 传输时长": "transmit_time_ms", + "空闲时长": "free_time_ms", + " -- SDMA": "sdma_time_ms", + " -- 空闲时长": "free_ms", + "E2E时长": "e2e_time_ms" + } diff --git a/profiler/advisor/display/prompt/cn/timeline_op_dispatch_prompt.py b/profiler/advisor/display/prompt/cn/timeline_op_dispatch_prompt.py new file mode 100644 index 000000000..0933c2914 --- /dev/null +++ b/profiler/advisor/display/prompt/cn/timeline_op_dispatch_prompt.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +class TimelineOpDispatchPrompt(object): + PROBLEM = "算子下发" + DESCRIPTION = "发现{}个算子编译问题。" + SUGGESTION = "请在python脚本入口添加以下代码关闭在线编译:\n" \ + "'torch_npu.npu.set_compile_mode(jit_compile=False) \n" \ + "torch_npu.npu.config.allow_internal_format = False' \n" diff --git a/profiler/advisor/display/prompt/en/__init__.py b/profiler/advisor/display/prompt/en/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/profiler/advisor/display/prompt/en/ai_core_freq_prompt.py b/profiler/advisor/display/prompt/en/ai_core_freq_prompt.py new file mode 100644 index 000000000..62a9507c0 --- /dev/null +++ b/profiler/advisor/display/prompt/en/ai_core_freq_prompt.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class AICoreFreqPrompt(object): + RANK_ID = "RANK {} " + PROBLEM = "AI Core Frequency" + DESCRIPTION = "{} operators are found during frequency reduction, and the reduction " \ + "ratio is larger than {}." + RANK_DESCRIPTION = "For rank {}, " + SUGGESTION = "Please check the temperature or max power of your machine."
+ + +if language == "en": + self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " + f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") + if self.rank: + self.desc = f"For rank {self.rank}, " + self.desc.lower() + self.suggestions = "Please check the temperature or max power of your machine." +else: + self.desc = ( + f"在降频期间发现{len(self.decrease_freq_ops)}个算子,频率降低比例超过了{self.DECREASE_FREQ_RATIO}。") + if self.rank: + self.desc = f"对于{self.rank}号卡," + self.desc + self.suggestions = "请检查您的机器温度或最大功率。" \ No newline at end of file diff --git a/profiler/advisor/display/prompt/en/dynamic_shape_prompt.py b/profiler/advisor/display/prompt/en/dynamic_shape_prompt.py new file mode 100644 index 000000000..b350f603f --- /dev/null +++ b/profiler/advisor/display/prompt/en/dynamic_shape_prompt.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class DynamicShapePrompt(object): + RANK_ID = "RANK {} " + PROBLEM = "Dynamic Shape Operator" + DESCRIPTION = "Found all operators are dynamic shape" + ENABLE_COMPILED_SUGGESTION = "1. Please try to set environment by execute `export HOST_CACHE_CAPACITY=20`.\n." \ + "2. 
Please place the following code at the entrance of the python script to disable jit compile.\n " \ + "Code: `torch_npu.npu.set_compile_mode(jit_compile=False) \n " \ + "torch_npu.npu.config.allow_internal_format = False`.\n" + RELEASE_SUGGESTION = "for details please refer to link : LINK" diff --git a/profiler/advisor/display/prompt/en/environment_variable_prompt.py b/profiler/advisor/display/prompt/en/environment_variable_prompt.py new file mode 100644 index 000000000..fb9f2d264 --- /dev/null +++ b/profiler/advisor/display/prompt/en/environment_variable_prompt.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class EnvironmentVariablePrompt(object): + PROBLEM = "Environment Variable Analysis" + DESCRIPTION = "Describe and suggest the optimal environment variable settings" + SUGGESTION = "Please set the optimal environment variable" diff --git a/profiler/advisor/display/prompt/en/fusion_ops_prompt.py b/profiler/advisor/display/prompt/en/fusion_ops_prompt.py new file mode 100644 index 000000000..e02f2f29a --- /dev/null +++ b/profiler/advisor/display/prompt/en/fusion_ops_prompt.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class FusionOpsPrompt(object): + PROBLEM = "Affinity Apis" + DESCRIPTION = "On the runtime env cann-{} and torch-{}, found {} apis to be replaced" + SUGGESTION = "Please replace training api according to sub table 'Affinity training api'" + EMPTY_STACK_DESCRIPTION = ", but with no stack" + EMPTY_STACKS_SUGGESTION = "These APIs have no code stack. If parameter 'with_stack=False' while profiling, " \ + "please refer to {} to set 'with_stack=True'. " \ + "Otherwise, ignore following affinity APIs due to backward broadcast lack of stack." diff --git a/profiler/advisor/display/prompt/en/graph_fusion_prompt.py b/profiler/advisor/display/prompt/en/graph_fusion_prompt.py new file mode 100644 index 000000000..1a2696f5d --- /dev/null +++ b/profiler/advisor/display/prompt/en/graph_fusion_prompt.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +class GraphFusionPrompt(object): + PROBLEM = "Fusion Issue" + DESCRIPTION = "Found {} fusion issues" + SUGGESTION = "Check fusion issues detail in mstt_advisor*.html" diff --git a/profiler/advisor/display/prompt/en/operator_prompt.py b/profiler/advisor/display/prompt/en/operator_prompt.py new file mode 100644 index 000000000..0491cefa9 --- /dev/null +++ b/profiler/advisor/display/prompt/en/operator_prompt.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +class OperatorPrompt(object): + RANK_ID = "RANK {} " + PYTORCH_OPERATOR_TUNE_SUGGESTION = "Optimize operator by AOE, such as:\n" \ + "'aoe --job_type=2 --model_path=$user_dump_path " \ + "--tune_ops_file={}'\n" + MSLITE_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE in mindspore lite framework, such as:\n" \ + f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \ + f"--modelFile=$user_model.onnx --outputFile=user_model " \ + f"--configFile=./config.txt\n" + PYTORCH_RELEASE_SUGGESTION = "for details please refer to link : LINK" + MSLITE_RELEASE_SUGGESTION = "\nThe config file for MSLite AOE usage is as follows:\n" \ + "[ascend_context]\n" \ + "aoe_mode=\"operator tuning\"\n" \ + "--tune_ops_file={}\n" \ + "\nFor details please refer to link : LINK" diff --git a/profiler/advisor/display/prompt/en/overall_summary_advice_prompt.py b/profiler/advisor/display/prompt/en/overall_summary_advice_prompt.py new file mode 100644 index 000000000..05bbd6215 --- /dev/null +++ b/profiler/advisor/display/prompt/en/overall_summary_advice_prompt.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +class OverallSummaryAdvicePrompt(object): + ADVICE_MAP = { + "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.", + "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.", + "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule." + } + TIME_NAME_MAP = { + "Computing Time": "computing", + "Uncovered Communication Time": "communication", + "Free Time": "free", + 'Cube Time(Num)': 'Cube Time', + 'Vector Time(Num)': 'Vector Time', + 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', + 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', + 'Other Time': "Other Computing Time", + 'SDMA Time(Num)': 'SDMA Time' + } + PERFORMANCE_TIME_DICT = { + "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', + 'Flash Attention Time(Backward)(Num)', 'Other Time'], + "Uncovered Communication Time(Wait Time)": [], + "Free Time": ['SDMA Time(Num)'] + } \ No newline at end of file diff --git a/profiler/advisor/display/prompt/en/overall_summary_analyzer_prompt.py b/profiler/advisor/display/prompt/en/overall_summary_analyzer_prompt.py new file mode 100644 index 000000000..4d3f17261 --- /dev/null +++ b/profiler/advisor/display/prompt/en/overall_summary_analyzer_prompt.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +class OverallSummaryAnalyzePrompt(object): + OVERALL_SUMMARY_ANALYZER = "Overall Summary" + ADVICE_MAP = { + "Computing Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Uncovered Communication Time": "if you want more detailed advice please go to mstt_advisor_*.html", + "Free Time": "if you want more detailed advice please go to mstt_advisor_*.html" + } + TIME_NAME_MAP = { + "Computing Time": "computing", + "Uncovered Communication Time": "communication", + "Free Time": "free", + 'Cube Time(Num)': 'Cube Time', + 'Vector Time(Num)': 'Vector Time', + 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', + 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', + 'Other Time': "Other Computing Time", + 'SDMA Time(Num)': 'SDMA Time' + } + PERFORMANCE_TIME_DICT = { + "Computing Time": "computing_time_ms", + " -- Flash Attention": "fa_time_ms", + " -- Conv": "conv_time_ms", + " -- Matmul": "matmul_time_ms", + " -- Vector": "vector_time_ms", + " -- SDMA(Tensor Move)": "tensor_move_time_ms", + " -- Other Cube": "other_cube_time_ms", + "Uncovered Communication Time": "uncovered_communication_time_ms", + " -- Wait": "wait_time_ms", + " -- Transmit": "transmit_time_ms", + "Free Time": "free_time_ms", + " -- SDMA": "sdma_time_ms", + " -- Free": "free_ms", + "E2E Time": "e2e_time_ms" + } diff --git a/profiler/advisor/display/prompt/en/timeline_op_dispatch_prompt.py b/profiler/advisor/display/prompt/en/timeline_op_dispatch_prompt.py new file mode 100644 index 000000000..e52a22cb7 --- /dev/null +++ b/profiler/advisor/display/prompt/en/timeline_op_dispatch_prompt.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class TimelineOpDispatchPrompt(object): + PROBLEM = "Operator Dispatch" + DESCRIPTION = "Found {} operator compile issues." + SUGGESTION = "Please place the following code at the entrance of the python script to disable jit compile. \n" \ + "Code: `torch_npu.npu.set_compile_mode(jit_compile=False) \n" \ + "torch_npu.npu.config.allow_internal_format = False` \n" + diff --git a/profiler/advisor/rules/cn/bandwidth_contention.yaml b/profiler/advisor/rules/cn/bandwidth_contention.yaml index 52a85b5ab..b6c24f1ac 100644 --- a/profiler/advisor/rules/cn/bandwidth_contention.yaml +++ b/profiler/advisor/rules/cn/bandwidth_contention.yaml @@ -1,4 +1,5 @@ -problem: "在执行计算和通信任务时,SDMA带宽低于 {threshold}GB/s。通常,并行计算和通信可以提高模型的运行效率。并发计算和通信任务可能会影响通信带宽。" +problem: "带宽分析" +description: "在执行计算和通信任务时,SDMA带宽低于 {threshold}GB/s。通常,并行计算和通信可以提高模型的运行效率。并发计算和通信任务可能会影响通信带宽。" sdma_baseline: 18 #M threshold: 0.8 top_num: 10 diff --git a/profiler/advisor/rules/cn/byte_alignment.yaml b/profiler/advisor/rules/cn/byte_alignment.yaml index a653d5632..6a4be8e95 100644 --- a/profiler/advisor/rules/cn/byte_alignment.yaml +++ b/profiler/advisor/rules/cn/byte_alignment.yaml @@ -1,4 +1,5 @@ -problem: "{count}个通信算子的数据大小未对齐,这会降低通信性能。" +problem: "字节对齐分析" +description: "{count}个通信算子的数据大小未对齐,这会降低通信性能。" min_size: 512 # byte top_num: 5 solutions: diff --git a/profiler/advisor/rules/cn/environment_variable_info.yaml b/profiler/advisor/rules/cn/environment_variable_info.yaml index d342612db..eedaf8399 100644 --- a/profiler/advisor/rules/cn/environment_variable_info.yaml +++
b/profiler/advisor/rules/cn/environment_variable_info.yaml @@ -1,7 +1,7 @@ ASCEND_GLOBAL_LOG_LEVEL: desc: "日志级别: 0-调试,1-信息,2-警告,3-错误。\n 默认为错误级别。" - suggest: "调试或信息级别可能会导致培训性能下降,\n + suggest: "调试或信息级别可能会导致训练性能下降,\n 建议通过执行命令'export ASCEND_GLOBAL_LOGLEVEL=3来设置错误级别。" HCCL_RDAM_TC: desc: "配置网络端口发送的RoCE数据包的DSCP值。\n diff --git a/profiler/advisor/rules/cn/gc.yaml b/profiler/advisor/rules/cn/gc.yaml index 78238f0d3..295f98766 100644 --- a/profiler/advisor/rules/cn/gc.yaml +++ b/profiler/advisor/rules/cn/gc.yaml @@ -1,3 +1,4 @@ +problem: GC分析 gc_problem_with_count: "检测到异常垃圾收集(GC)事件{gc_count}次,总时间为{gc_total_time}毫秒。\n GC操作耗时且会阻塞整个过程。因此,模型训练过程中的某些步骤比其他步骤需要更长的时间。" gc_problem_with_free: "由于torch_npu的版本较低,在分析时没有收集垃圾收集(GC)数据。但在{free_duration_time}微秒(us)的空闲时间内几乎没有主机任务,这可能是由Python的异常GC引起的。" diff --git a/profiler/advisor/rules/cn/packet.yaml b/profiler/advisor/rules/cn/packet.yaml index 2b0fff4b7..621b0cb07 100644 --- a/profiler/advisor/rules/cn/packet.yaml +++ b/profiler/advisor/rules/cn/packet.yaml @@ -1,4 +1,5 @@ -problem: "过小的通信数据包可能会导致host传递瓶颈。\n" +problem: "包分析" +description: "过小的通信数据包可能会导致host传递瓶颈。\n" sdma_problem: "在SDMA通信中,通信数据量的{abnormal_ratio}小于{min_size}MB,总时间为{abnormal_time}ms。\n" rdma_problem: "在RDMA通信中,通信数据量的{abnormal_ratio}小于{min_size}MB,总时间为{abnormal_time}ms。\n" min_sdma_size: 16 #M diff --git a/profiler/advisor/rules/cn/rdma_analysis.yaml b/profiler/advisor/rules/cn/rdma_analysis.yaml index c5a7bd14f..1492b4235 100644 --- a/profiler/advisor/rules/cn/rdma_analysis.yaml +++ b/profiler/advisor/rules/cn/rdma_analysis.yaml @@ -1,4 +1,5 @@ -problem: "发生RDMA通信重传。单次重传需要4秒以上。重传问题在{group_count}通信域中检测到。\n建议执行以下建议。" +problem: "通信重传分析" +description: "发生RDMA通信重传。单次重传需要4秒以上。重传问题在{group_count}通信域中检测到。\n建议执行以下建议。" min_retransmission_time: 4000 #ms solutions: - 检查RDMA传输时长: diff --git a/profiler/advisor/rules/en/bandwidth_contention.yaml b/profiler/advisor/rules/en/bandwidth_contention.yaml index 684ac22e9..fd3f75573 100644 --- a/profiler/advisor/rules/en/bandwidth_contention.yaml +++ 
b/profiler/advisor/rules/en/bandwidth_contention.yaml @@ -1,4 +1,5 @@ -problem: "The SDMA bandwidth is lower than {threshold} GB/s when computing and communication tasks are performed \n +problem: "Bandwidth Contention Analysis" +description: "The SDMA bandwidth is lower than {threshold} GB/s when computing and communication tasks are performed \n concurrently. Generally, parallel computing and communication improves the running efficiency of the model. \n Concurrent computing and communication tasks may affect the communication bandwidth." sdma_baseline: 18 #M diff --git a/profiler/advisor/rules/en/byte_alignment.yaml b/profiler/advisor/rules/en/byte_alignment.yaml index 2e81e3cca..bb580f31b 100644 --- a/profiler/advisor/rules/en/byte_alignment.yaml +++ b/profiler/advisor/rules/en/byte_alignment.yaml @@ -1,4 +1,5 @@ -problem: "Found the data size of {count} communication operators is not aligned, which deteriorates the communication performance." +problem: "Byte Alignment Analysis" +description: "Found the data size of {count} communication operators is not aligned, which deteriorates the communication performance." min_size: 512 # byte top_num: 5 solutions: diff --git a/profiler/advisor/rules/en/gc.yaml b/profiler/advisor/rules/en/gc.yaml index d7f178cab..a2e646a98 100644 --- a/profiler/advisor/rules/en/gc.yaml +++ b/profiler/advisor/rules/en/gc.yaml @@ -1,3 +1,4 @@ +problem: GC Analysis gc_problem_with_count: "Abnormal garbage collection (GC) event is detected for {gc_count} times, and the total time is {gc_total_time} ms\n. The GC operation is time-consuming and blocks the entire process. As a result, some steps in the model training process take a longer time than other steps." gc_problem_with_free: "Due to the lower version of torch_npu, no garbage collection(GC) data was collected while profiling. 
But nearly no host tasks within {free_duration_time} microseconds(us) of free time, which is likely caused by abnorma GC of Python" diff --git a/profiler/advisor/rules/en/packet.yaml b/profiler/advisor/rules/en/packet.yaml index d93809882..c74cd16fd 100644 --- a/profiler/advisor/rules/en/packet.yaml +++ b/profiler/advisor/rules/en/packet.yaml @@ -1,4 +1,5 @@ -problem: "Excessive small communication packets may cause host delivery bottlenecks.\n" +problem: "Packet analysis" +description: "Excessive small communication packets may cause host delivery bottlenecks.\n" sdma_problem: "In the SDMA communication, {abnormal_ratio} of the communication data volume is less than {min_size} MB, and the total time is {abnormal_time} ms.\n" rdma_problem: "In the RDMA communication, {abnormal_ratio} of the communication data volume is less than {min_size} MB, and the total time is {abnormal_time} ms." min_sdma_size: 16 #M diff --git a/profiler/advisor/rules/en/rdma_analysis.yaml b/profiler/advisor/rules/en/rdma_analysis.yaml index 6c6062775..a21f9fa98 100644 --- a/profiler/advisor/rules/en/rdma_analysis.yaml +++ b/profiler/advisor/rules/en/rdma_analysis.yaml @@ -1,4 +1,5 @@ -problem: "RDMA communication retransmission occurs. A single retransmission takes more than 4s. Retransmission problems +problem: "Communication retransmission analysis" +description: "RDMA communication retransmission occurs. A single retransmission takes more than 4s. Retransmission problems are detected in {group_count} communication domains. 
\n Advised to perform the following suggestions" min_retransmission_time: 4000 #ms diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py index 59a4da80c..cc32f5495 100644 --- a/profiler/cli/analyze_cli.py +++ b/profiler/cli/analyze_cli.py @@ -46,10 +46,12 @@ def analyze_cli(**kwargs): @click.option("--force", is_flag=True, help="Indicates whether to skip file size verification and owner verification") -@click.option("--language", "-l", +@click.option("-l", + "--language", type=click.Choice(["cn", "en"]), - default="cn", - help="Language of Advisor html") + required=False, + default="en", + help="Language of the profiling advisor.") @debug_option def analyze_all(**kwargs) -> None: try: @@ -77,13 +79,18 @@ def analyze_all(**kwargs) -> None: @click.option("-pt", "--profiling_type", metavar="", - default=EnumParamsParser().get_default(Constant.PROFILING_TYPE_UNDER_LINE), required=False, type=click.Choice(EnumParamsParser().get_options(Constant.PROFILING_TYPE_UNDER_LINE)), help="enter the profiling type, selectable range ascend_pytorch_profiler, mslite ,msprof") @click.option("--force", is_flag=True, help="Indicates whether to skip file size verification and owner verification") +@click.option("-l", + "--language", + type=click.Choice(["cn", "en"]), + required=False, + default="en", + help="Language of the profiling advisor.") @debug_option def analyze_schedule(**kwargs) -> None: try: @@ -111,13 +118,18 @@ def analyze_schedule(**kwargs) -> None: @click.option("-pt", "--profiling_type", metavar="", - default=EnumParamsParser().get_default(Constant.PROFILING_TYPE_UNDER_LINE), required=False, type=click.Choice(EnumParamsParser().get_options(Constant.PROFILING_TYPE_UNDER_LINE)), help="enter the profiling type, selectable range ascend_pytorch_profiler, mslite ,msprof") @click.option("--force", is_flag=True, help="Indicates whether to skip file size verification and owner verification") +@click.option("-l", + "--language", + type=click.Choice(["cn", "en"]), + 
required=False, + default="en", + help="Language of the profiling advisor.") @debug_option def analyze_computation(**kwargs) -> None: try: diff --git a/profiler/prof_common/additional_args_manager.py b/profiler/prof_common/additional_args_manager.py index 669ab428f..1bc8a9815 100644 --- a/profiler/prof_common/additional_args_manager.py +++ b/profiler/prof_common/additional_args_manager.py @@ -36,7 +36,7 @@ from profiler.advisor.utils.utils import singleton class AdditionalArgsManager: def __init__(self): self._args = None - self._language = "cn" + self._language = "en" self._force = False @property @@ -50,4 +50,4 @@ class AdditionalArgsManager: def init(self, args: Dict): self._args = args self._force = self._args.get("force", False) - self._language = self._args.get("language", "cn") + self._language = self._args.get("language", "en") diff --git a/profiler/prof_common/constant.py b/profiler/prof_common/constant.py index 336ebfb47..cab26cd35 100644 --- a/profiler/prof_common/constant.py +++ b/profiler/prof_common/constant.py @@ -283,12 +283,6 @@ class Constant(object): TIMELINE_ACL_TO_NPU_NO_STACK_CODE: "Incoming flow is 'acl_to_npu', without call stacks in profiling." } AFFINITY_TRAINING_API = "Affinity training api" - TIMELINE_EMPTY_STACKS_PROMPT = "These APIs have no code stack. If parameter 'with_stack=False' while profiling, " \ - "please refer to {timeline_profiling_doc_url} to set 'with_stack=True'. " \ - "Otherwise, ignore following affinity APIs due to backward broadcast lack of stack." 
- TIMELINE_EMPTY_STACKS_PROMPT_CN = "这些API接口没有代码堆栈。如果采集profiling时参数为'with_stack=False'," \ - "请参考{timeline_profiling_doc_url}设置'with_stack=True'。" \ - "另外,由于反向传播没有堆栈,请忽略以下亲和APIs。" CLUSTER_ANALYSIS = "Cluster analysis" SLOW_RANK_TIME_RATIO_THRESHOLD = 0.05 @@ -388,95 +382,3 @@ class Constant(object): PYTORCH = "pytorch" MINDSPORE = "mindspore" - # overall summary advice - ADVISOR_ADVICE_MAP = { - "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.", - "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.", - "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule." - } - TIME_NAME_MAP = { - "Computing Time": "computing", - "Uncovered Communication Time": "communication", - "Free Time": "free", - 'Cube Time(Num)': 'Cube Time', - 'Vector Time(Num)': 'Vector Time', - 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', - 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', - 'Other Time': "Other Computing Time", - 'SDMA Time(Num)': 'SDMA Time' - } - ADVISOR_PERFORMANCE_TIME_DICT = { - "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', - 'Flash Attention Time(Backward)(Num)', 'Other Time'], - "Uncovered Communication Time(Wait Time)": [], - "Free Time": ['SDMA Time(Num)'] - } - ADVISOR_ADVICE_MAP_CN = { - "计算时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor computation.", - "未被掩盖的通信时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule.", - "空闲时长": "如果你想了解更多详细建议请使用 msprof-analyze advisor schedule." 
- } - TIME_NAME_MAP_CN = { - "计算时长": "computing", - "未被掩盖的通信时长": "communication", - "空闲时长": "free", - 'Cube算子时长(数量)': 'Cube Time', - 'Vector算子时长(数量)': 'Vector Time', - 'Flash Attention算子时长(前向)(数量)': 'Flash Attention Time(Forward)', - 'Flash Attention算子时长(反向)(数量)': 'Flash Attention Time(Backward)', - '其它时长': "Other Computing Time", - 'SDMA时长(数量)': 'SDMA Time' - } - ADVISOR_PERFORMANCE_TIME_DICT_CN = { - "计算时长": ['Cube时长(数量)', 'Vector时长(数量)', 'Flash Attention时长(前向)(数量)', - 'Flash Attention时长(反向)(数量)', '其它时长'], - "未被掩盖的通信时长(等待时长)": [], - "空闲时长": ['SDMA Time(Num)'] - } - - # overall summary analyzer - OVERALL_SUMMARY_ANALYZER = "overall summary" - ANALYZER_ADVICE_MAP = { - "Computing Time": "if you want more detailed advice please go to mstt_advisor_*.html", - "Uncovered Communication Time": "if you want more detailed advice please go to mstt_advisor_*.html", - "Free Time": "if you want more detailed advice please go to mstt_advisor_*.html" - } - ANALYZER_PERFORMANCE_TIME_DICT = { - "Computing Time": "computing_time_ms", - " -- Flash Attention": "fa_time_ms", - " -- Conv": "conv_time_ms", - " -- Matmul": "matmul_time_ms", - " -- Vector": "vector_time_ms", - " -- SDMA(Tensor Move)": "tensor_move_time_ms", - " -- Other Cube": "other_cube_time_ms", - "Uncovered Communication Time": "uncovered_communication_time_ms", - " -- Wait": "wait_time_ms", - " -- Transmit": "transmit_time_ms", - "Free Time": "free_time_ms", - " -- SDMA": "sdma_time_ms", - " -- Free": "free_ms", - "E2E Time": "e2e_time_ms" - } - OVERALL_SUMMARY_ANALYZER_CN = "整网耗时分析" - ANALYZER_ADVICE_MAP_CN = { - "计算时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", - "未被掩盖的通信时长": "如果你想了解更多详细建议请看mstt_advisor_*.html", - "空闲时长": "如果你想了解更多详细建议请看mstt_advisor_*.html" - } - ANALYZER_PERFORMANCE_TIME_DICT_CN = { - "计算时长": "computing_time_ms", - " -- Flash Attention": "fa_time_ms", - " -- Conv": "conv_time_ms", - " -- Matmul": "matmul_time_ms", - " -- Vector": "vector_time_ms", - " -- SDMA(Tensor Move)": "tensor_move_time_ms", - " 
-- 其它Cube": "other_cube_time_ms", - "未被掩盖的通信时长": "uncovered_communication_time_ms", - " -- 等待时长": "wait_time_ms", - " -- 传输时长": "transmit_time_ms", - "空闲时长": "free_time_ms", - " -- SDMA": "sdma_time_ms", - " -- 空闲时长": "free_ms", - "E2E时长": "e2e_time_ms" - } - -- Gitee From 22905a4f27940da85863b106a77b33fa9c71c06b Mon Sep 17 00:00:00 2001 From: xiao-yamin Date: Wed, 18 Dec 2024 15:47:09 +0800 Subject: [PATCH 8/8] =?UTF-8?q?=E9=87=8D=E6=9E=842,=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E6=A0=87=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../alignment/byte_alignment_checker.py | 12 ++----- .../bandwidth_contention_checker.py | 6 +--- .../computation/aicpu/aicpu_checker.py | 19 ++++------ .../computation/bound/block_dim_checker.py | 36 ++++++++++--------- .../bound/operator_bound_checker.py | 22 ++++++++---- .../analyzer/computation/operator_checker.py | 3 -- .../overall/overall_summary_analyzer.py | 17 +++------ .../display/prompt/cn/block_dim_prompt.py | 21 +++++++++++ .../prompt/cn/operator_bound_prompt.py | 18 ++++++++++ .../display/prompt/en/ai_core_freq_prompt.py | 16 +-------- .../display/prompt/en/block_dim_prompt.py | 20 +++++++++++ .../prompt/en/operator_bound_prompt.py | 19 ++++++++++ profiler/advisor/result/result.py | 6 +++- profiler/advisor/rules/cn/aicpu_rules.yaml | 4 +++ profiler/advisor/rules/en/aicpu_rules.yaml | 4 +++ 15 files changed, 142 insertions(+), 81 deletions(-) create mode 100644 profiler/advisor/display/prompt/cn/block_dim_prompt.py create mode 100644 profiler/advisor/display/prompt/cn/operator_bound_prompt.py create mode 100644 profiler/advisor/display/prompt/en/block_dim_prompt.py create mode 100644 profiler/advisor/display/prompt/en/operator_bound_prompt.py diff --git a/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py b/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py index 86b79c4fd..b99e91df9 100644 --- 
a/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py +++ b/profiler/advisor/analyzer/communication/alignment/byte_alignment_checker.py @@ -51,15 +51,9 @@ class ByteAlignmentChecker: self.abnormal_ops = [] self.suggestions = [] self._init_rule() - language = AdditionalArgsManager().language - if language == "en": - self.headers = [ - "op name", "total size(Byte)", "duration(us)", "abnormal duration(us)", "bandwidth(GB/s)" - ] - else: - self.headers = [ - "算子名称", "总大小(Byte)", "时长(us)", "异常时长(us)", "带宽(GB/s)" - ] + self.headers = [ + "op name", "total size(Byte)", "duration(us)", "abnormal duration(us)", "bandwidth(GB/s)" + ] @staticmethod def _calculate_bandwidth_gb_s(size, duration): diff --git a/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py b/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py index f692d52a7..9897b5ee6 100644 --- a/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py +++ b/profiler/advisor/analyzer/communication/contention/bandwidth_contention_checker.py @@ -72,11 +72,7 @@ class BandwidthContentionChecker: self.abnormal_sdma_list: List[SDMAOperator] = [] self.suggestions = [] self._init_rule() - language = AdditionalArgsManager().language - if language == "en": - self.headers = ["op name", "duration(ms)", "bandwidth(GB/s)"] - else: - self.headers = ["算子名称", "时长(ms)", "带宽(GB/s)"] + self.headers = ["op name", "duration(ms)", "bandwidth(GB/s)"] @staticmethod def check_sdma_operator(hccl_op: HcclInfo): diff --git a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py index 8b1171664..752c5b38c 100644 --- a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py +++ b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py @@ -28,13 +28,7 @@ from profiler.prof_common.constant import Constant class AicpuChecker(OperatorChecker): _CHECKER = "aicpu 
operator" - _PROBLEM = "AICPU operator" - _PROBLEM_CN = "AICPU算子" _MIN_TASK_DURATION = 20 - _description = f"Some operators and task duration exceed {_MIN_TASK_DURATION} us, such as :\n" - _description_cn = f"一些算子和任务执行时间超过了{_MIN_TASK_DURATION}us,比如:\n" - _SUGGESTION: List[str] = ["Modify code to avoid aicpu operator"] - _SUGGESTION_CN: List[str] = ["修改代码避免使用aicpu类算子"] STACK_INFO_ITEMS = "stack_info" SUGGESTION_INFO_ITEMS = "suggestions" _ITEMS = [ @@ -49,6 +43,7 @@ class AicpuChecker(OperatorChecker): self.load_aicpu_rules() self.total_task_duration = 0.0 self.aicpu_task_duration = 0.0 + self.double_suggestion = None def load_aicpu_rules(self): language = AdditionalArgsManager().language @@ -61,6 +56,10 @@ class AicpuChecker(OperatorChecker): logger.warning("Skip analyze aicpu issues, because %s does not exist.", rule_path) self.aicpu_rules = FileManager.read_yaml_file(rule_path) + self._PROBLEM = self.aicpu_rules.get("problem") + self._description = self.aicpu_rules.get("description").format(self._MIN_TASK_DURATION) + self._SUGGESTION = [self.aicpu_rules.get("suggestion")] + self.double_suggestion = self.aicpu_rules.get("double_suggestion") self.filter_aicpu_rules(self.aicpu_rules) for checker_name, check_rule in self.aicpu_rules.items(): if not isinstance(check_rule, (list, dict,)): @@ -158,13 +157,7 @@ class AicpuChecker(OperatorChecker): and op.op_name not in double_type_ai_cpu_operator): double_type_ai_cpu_operator.append(op.op_name) if bool(double_type_ai_cpu_operator): - language = AdditionalArgsManager().language - if language == "en": - self._SUGGESTION.append("Try to convert double type operator to float, such as {}".format( - ",".join(double_type_ai_cpu_operator))) - else: - self._SUGGESTION.append("尝试将double类型的算子转换成float,比如{}".format( - ",".join(double_type_ai_cpu_operator))) + self._SUGGESTION.append(self.double_suggestion.format(",".join(double_type_ai_cpu_operator))) return True def make_render(self, html_render, record, add_render_list=True, 
**kwargs): diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index 2e2877717..6b83ef88e 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -27,17 +27,29 @@ logger = logging.getLogger() class BlockDimChecker(OperatorChecker): _SUGGESTION: List[str] = [] _CHECKER = "block dim" - _PROBLEM = "block dim" - _PROBLEM_CN = "AICore核数" _aicore_num = 0 _aiv_num = 0 - _description = "some operator does not make full use of {} ai core" - _description_cn = "一些算子没有充分利用{}个AICore核" _ITEMS = [ "op_name", "op_type", "task_type", "task_duration", "income", "block_dim", "mix_block_dim", "input_shapes", "input_data_types", "input_formats", "output_shapes", "output_data_types", "output_formats" ] + def __init__(self, cann_version): + super(BlockDimChecker, self).__init__(cann_version=cann_version) + self._init_prompt_by_language() + + def _init_prompt_by_language(self): + language = AdditionalArgsManager().language + if language == "en": + from profiler.advisor.display.prompt.en.block_dim_prompt import BlockDimPrompt + else: + from profiler.advisor.display.prompt.cn.block_dim_prompt import BlockDimPrompt + + self._PROBLEM = BlockDimPrompt.PROBLEM + self._description = BlockDimPrompt.DESCRIPTION + self.aiv_num_desc = BlockDimPrompt.AIV_NUM_DESCRIPTION + self.top_duration_op_desc = BlockDimPrompt.TOP_DURATION_OP_DESCRIPTION + def pre_check(self, profiling_data) -> bool: return not self.is_dynamic_shape(profiling_data) @@ -86,18 +98,10 @@ class BlockDimChecker(OperatorChecker): except ValueError as e: logger.warning("get aiv_num failed, please check info.json: %s", e) - language = AdditionalArgsManager().language - if language == "en": - self._description = self._description.format(self._aicore_num) - if self._aiv_num: - self._description += f" or {self._aiv_num} ai vector core" - self._description 
+= f";\n Top-{OperatorChecker._MAX_TUNE_OP_NUM} operator of " \ - "task duration are as follows:\n" - else: - self._description_cn = self._description_cn.format(self._aicore_num) - if self._aiv_num: - self._description_cn += f"或者{self._aiv_num}个AIVector核" - self._description_cn += f";\n 任务耗时最长的{OperatorChecker._MAX_TUNE_OP_NUM}个算子如下:" + self._description = self._description.format(self._aicore_num) + if self._aiv_num: + self._description += self.aiv_num_desc.format(self._aiv_num) + self._description += self.top_duration_op_desc.format(OperatorChecker._MAX_TUNE_OP_NUM) return True def _check_operator(self, op_info) -> bool: diff --git a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py index fab954a9c..4aeb7a432 100644 --- a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py +++ b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py @@ -16,6 +16,7 @@ import logging from typing import List from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.prof_common.additional_args_manager import AdditionalArgsManager from profiler.prof_common.constant import Constant from profiler.advisor.config.config import Config from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset @@ -27,20 +28,27 @@ logger = logging.getLogger() class OperatorBoundChecker(OperatorChecker): _MIN_TASK_DURATION = 20 # min task duration 20us _CHECKER = "operator no bound" - _PROBLEM = "operator no bound" - _PROBLEM_CN = "算子瓶颈" _SUGGESTION: List[str] = [] - _description = ( - f"There is no mte, cube, vector, scalar ratio is more than {to_percent(Config().operator_bound_ratio)};\n" + - f"Top task duration operators need to be tuned are as follows: \n") - _description_cn = ("mte,cube,vetor,scalar比都没有超过 {to_percent(Config().operator_bound_ratio)};\n" - "需要调整的任务执行时间最长的算子如下: \n") _ITEMS = [ "op_name", "op_type", 
"task_type", "task_duration", "vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio", "mte2_ratio", "mte3_ratio", "block_dim", "input_shapes", "input_data_types", "input_formats", "output_shapes", "output_data_types", "output_formats" ] + def __init__(self, cann_version) -> None: + super().__init__(cann_version=cann_version) + self._init_prompt_by_language() + + def _init_prompt_by_language(self): + language = AdditionalArgsManager().language + if language == "en": + from profiler.advisor.display.prompt.en.operator_bound_prompt import OperatorBoundPrompt + else: + from profiler.advisor.display.prompt.cn.operator_bound_prompt import OperatorBoundPrompt + + self._PROBLEM = OperatorBoundPrompt.PROBLEM + self._description = OperatorBoundPrompt.DESCRIPTION.format(to_percent(Config().operator_bound_ratio)) + def pre_check(self, profiling_data) -> bool: return not self.is_dynamic_shape(profiling_data) diff --git a/profiler/advisor/analyzer/computation/operator_checker.py b/profiler/advisor/analyzer/computation/operator_checker.py index 1be199b82..e70b13b18 100644 --- a/profiler/advisor/analyzer/computation/operator_checker.py +++ b/profiler/advisor/analyzer/computation/operator_checker.py @@ -38,13 +38,10 @@ class OperatorChecker(VersionControl): _MIN_TOTAL_DURATION_RATIO = 1.0 _CHECKER = str() _PROBLEM = str() - _PROBLEM_CN = str() _description = str() - _description_cn = str() STACK_INFO_ITEMS = "" _ITEMS: List[str] = [] _SUGGESTION: List[str] = [] - _SUGGESTION_CN: List[str] = [] SKIP_CHECK_MSG = "Skip %s checker because of not containing %s" _tune_op_info_list: List[OpInfo] = [] diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index 7c85f200d..3c6952694 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -47,9 +47,9 @@ class OverallSummaryAnalyzer(BaseAnalyzer): def 
_init_prompt_by_language(self): language = AdditionalArgsManager().language if language == "en": - from profiler.advisor.display.prompt.en.overall_summary_advice_prompt import OverallSummaryAnalyzePrompt + from profiler.advisor.display.prompt.en.overall_summary_analyzer_prompt import OverallSummaryAnalyzePrompt else: - from profiler.advisor.display.prompt.cn.overall_summary_advice_prompt import OverallSummaryAnalyzePrompt + from profiler.advisor.display.prompt.cn.overall_summary_analyzer_prompt import OverallSummaryAnalyzePrompt self.over_summary_analyzer = OverallSummaryAnalyzePrompt.OVERALL_SUMMARY_ANALYZER self.advice_map = OverallSummaryAnalyzePrompt.PERFORMANCE_TIME_DICT @@ -156,18 +156,11 @@ class OverallSummaryAnalyzer(BaseAnalyzer): self.bottleneck_str = result def format_over_summary_analysis(self): - language = AdditionalArgsManager().language - if language == "en": - headers = ['Performance Index', 'Duration(ms)', 'Duration Ratio'] - else: - headers = ['性能分析', '时长(ms)', '时长占比'] + headers = ['Performance Index', 'Duration(ms)', 'Duration Ratio'] performance_data = self.get_analysis_data(self._disaggregate_perf) benchmark_data = self.get_analysis_data(self._disaggregate_benchmark_perf) if self._has_benchmark_profiling: - if language == "en": - headers.append('Diff Duration(ms)') - else: - headers.append('时长差距(ms)') + headers.append('Diff Duration(ms)') self.format_analysis_with_benchmark(performance_data, benchmark_data, headers) else: self.format_analysis_only(performance_data, headers) @@ -238,7 +231,7 @@ class OverallSummaryAnalyzer(BaseAnalyzer): "details": [self.over_summary_analysis] } self.html_render.render_template(key="overall", - title=self.over_summary_analyzer, + title="Overall Summary", template_dir="templates", template_name="cluster_analysis.html", cann_version=self.cann_version, diff --git a/profiler/advisor/display/prompt/cn/block_dim_prompt.py b/profiler/advisor/display/prompt/cn/block_dim_prompt.py new file mode 100644 index 
000000000..eb0c87ebb --- /dev/null +++ b/profiler/advisor/display/prompt/cn/block_dim_prompt.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class BlockDimPrompt(object): + PROBLEM = "AICore核数" + DESCRIPTION = "一些算子没有充分利用{}个AICore核" + AIV_NUM_DESCRIPTION = "或者{}个AIVector核" + TOP_DURATION_OP_DESCRIPTION = ";\n 任务耗时最长的{}个算子如下:" + diff --git a/profiler/advisor/display/prompt/cn/operator_bound_prompt.py b/profiler/advisor/display/prompt/cn/operator_bound_prompt.py new file mode 100644 index 000000000..76257e36a --- /dev/null +++ b/profiler/advisor/display/prompt/cn/operator_bound_prompt.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +class OperatorBoundPrompt(object): + PROBLEM = "算子瓶颈" + DESCRIPTION = "mte,cube,vetor,scalar比都没有超过 {},需要调整的任务执行时间最长的算子如下:\n" diff --git a/profiler/advisor/display/prompt/en/ai_core_freq_prompt.py b/profiler/advisor/display/prompt/en/ai_core_freq_prompt.py index 62a9507c0..7737a372a 100644 --- a/profiler/advisor/display/prompt/en/ai_core_freq_prompt.py +++ b/profiler/advisor/display/prompt/en/ai_core_freq_prompt.py @@ -19,18 +19,4 @@ class AICoreFreqPrompt(object): DESCRIPTION = "{} operators are found during frequency reduction, and the reduction " \ "ratio is larger than {}." RANK_DESCRIPTION = "For rank {}, " - SUGGESTION = "Please check the temperature or max power of your machine." - - -if language == "en": - self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " - f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") - if self.rank: - self.desc = f"For rank {self.rank}, " + self.desc.lower() - self.suggestions = "Please check the temperature or max power of your machine." -else: - self.desc = ( - f"在降频期间发现{len(self.decrease_freq_ops)}个算子,频率降低比例超过了{self.DECREASE_FREQ_RATIO}。") - if self.rank: - self.desc = f"对于{self.rank}号卡," + self.desc - self.suggestions = "请检查您的机器温度或最大功率。" \ No newline at end of file + SUGGESTION = "Please check the temperature or max power of your machine." \ No newline at end of file diff --git a/profiler/advisor/display/prompt/en/block_dim_prompt.py b/profiler/advisor/display/prompt/en/block_dim_prompt.py new file mode 100644 index 000000000..410fcdd41 --- /dev/null +++ b/profiler/advisor/display/prompt/en/block_dim_prompt.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class BlockDimPrompt(object): + PROBLEM = "block dim" + DESCRIPTION = "some operator does not make full use of {} ai core" + AIV_NUM_DESCRIPTION = " or {} ai vector core" + TOP_DURATION_OP_DESCRIPTION = ";\n Top-{} operator of task duration are as follows:\n" diff --git a/profiler/advisor/display/prompt/en/operator_bound_prompt.py b/profiler/advisor/display/prompt/en/operator_bound_prompt.py new file mode 100644 index 000000000..f4f29f25d --- /dev/null +++ b/profiler/advisor/display/prompt/en/operator_bound_prompt.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +class OperatorBoundPrompt(object): + PROBLEM = "operator no bound" + DESCRIPTION = "There is no mte, cube, vector, scalar ratio is more than {},\n" \ + "Top task duration operators need to be tuned are as follows: \n" diff --git a/profiler/advisor/result/result.py b/profiler/advisor/result/result.py index 40324d034..74b9358f6 100644 --- a/profiler/advisor/result/result.py +++ b/profiler/advisor/result/result.py @@ -153,7 +153,11 @@ class OptimizeResult: self._tune_op_list.append(operator_name) def add(self, overview_item): - sheet_name = "problems" + language = AdditionalArgsManager().language + if language == "en": + sheet_name = "problems" + else: + sheet_name = "问题综述" headers = overview_item.headers data = overview_item.data diff --git a/profiler/advisor/rules/cn/aicpu_rules.yaml b/profiler/advisor/rules/cn/aicpu_rules.yaml index 4d69b0642..f14fc044d 100644 --- a/profiler/advisor/rules/cn/aicpu_rules.yaml +++ b/profiler/advisor/rules/cn/aicpu_rules.yaml @@ -1,3 +1,7 @@ +problem: "AICPU算子" +description: "一些算子和任务执行时间超过了{}us,比如:\n" +suggestion: "修改代码避免使用aicpu类算子" +double_suggestion: "尝试将double类型的算子转换成float,比如{}" DataTypeSuggeation: &DataTypeSuggeation "数据类型{}在{}算子中可能会造成AICpu问题, 如果可以,尝试转换成{}。" AICPU_DOC_URL: &AICPU_DOC_URL "https://gitee.com/ascend/mstt/blob/master/profiler/advisor/doc/Samples%20of%20AI%20CPU%20Operator%20Replacement.md" diff --git a/profiler/advisor/rules/en/aicpu_rules.yaml b/profiler/advisor/rules/en/aicpu_rules.yaml index bdbb71fec..d1869a25e 100644 --- a/profiler/advisor/rules/en/aicpu_rules.yaml +++ b/profiler/advisor/rules/en/aicpu_rules.yaml @@ -1,3 +1,7 @@ +problem: "AICPU operator" +description: "Some operators and task duration exceed {} us, such as :\n" +suggestion: "Modify code to avoid aicpu operator" +double_suggestion: "Try to convert double type operator to float, such as {}" DataTypeSuggeation: &DataTypeSuggeation "Data type {} in {} operator may cause AICPU issues, Try to convert to {} if possible." 
AICPU_DOC_URL: &AICPU_DOC_URL "https://gitee.com/ascend/mstt/blob/master/profiler/advisor/doc/Samples%20of%20AI%20CPU%20Operator%20Replacement.md" -- Gitee