diff --git a/docs/sample_code/profiler/mstx_profiler.py b/docs/sample_code/profiler/mstx_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..20c259d38b76e715f911e336a6940a35e1620381
--- /dev/null
+++ b/docs/sample_code/profiler/mstx_profiler.py
@@ -0,0 +1,79 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""mstx Profiler Example"""
+import numpy as np
+
+import mindspore
+import mindspore.dataset as ds
+from mindspore import nn
+from mindspore.profiler import ProfilerLevel, ProfilerActivity, schedule, tensorboard_trace_handler, mstx
+
+
+class Net(nn.Cell):
+    """Minimal network with a single Dense(2, 2) layer."""
+
+    def __init__(self):
+        super(Net, self).__init__()
+        self.fc = nn.Dense(2, 2)
+
+    def construct(self, x):
+        return self.fc(x)
+
+
+def generator_net():
+    """Yield 10 (data, label) pairs: float32 ones of shape [2, 2] and int32 ones of shape [2]."""
+    for _ in range(10):
+        yield np.ones([2, 2]).astype(np.float32), np.ones([2]).astype(np.int32)
+
+
+def forward_fn(data, label):
+    """Compute logits and loss, emitting an mstx mark between the two; return (loss, logits)."""
+    logits = model(data)
+    mstx.mark("backward_begin")
+    loss = loss_fn(logits, label)
+    return loss, logits
+
+
+def train_step(data, label):
+    """Run one training step with an mstx range around each phase; return the loss."""
+    range_id1 = mstx.range_start("forward_and_backward")
+    (loss, _), grads = grad_fn(data, label)
+    mstx.range_end(range_id1)
+    range_id2 = mstx.range_start("optimizer_step")
+    optimizer(grads)
+    mstx.range_end(range_id2)
+    return loss
+
+
+if __name__ == "__main__":
+    mindspore.set_device("Ascend")
+    model = Net()
+    optimizer = nn.Momentum(model.trainable_params(), 1, 0.9)
+    grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
+    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
+    stream = mindspore.runtime.current_stream()
+    # pylint: disable=protected-access
+    experimental_config = mindspore.profiler._ExperimentalConfig(
+        profiler_level=ProfilerLevel.LevelNone,
+        mstx=True)
+    with mindspore.profiler.profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.NPU],
+        schedule=schedule(wait=0, warmup=1, active=1, repeat=1, skip_first=0),
+        on_trace_ready=tensorboard_trace_handler("./data"),
+        experimental_config=experimental_config
+    ) as profiler:
+        for step_data, step_label in ds.GeneratorDataset(generator_net(), ["data", "label"]):
+            train_step(step_data, step_label)
+            profiler.step()
diff --git a/tutorials/source_en/debug/profiler.md b/tutorials/source_en/debug/profiler.md
index 8fd491f14209808cffd6db471afee3bda2812606..5807116aedd01370c6e98fa013c5111979688b30 100644
--- a/tutorials/source_en/debug/profiler.md
+++ b/tutorials/source_en/debug/profiler.md
@@ -18,7 +18,7 @@ This tutorial introduces how to use MindSpore Profiler for performance tuning on
 
 ## Usage
 
-There are four ways to collect training performance data, and the following describes how to use Profiler enablement depending on the scenario.
+There are five ways to collect training performance data, and the following describes how to use Profiler enablement depending on the scenario.
### Method 1: mindspore.Profiler Interface Enabling @@ -181,6 +181,27 @@ from mindspore.profiler.profiler import analyse analyse("./profiler_data_path") # './profiler_data_path' is the data path ``` +### Method 5: Lightweight Marking + +The traditional profiler workflow is time-consuming and produces large amounts of data in large cluster scenarios. To address this, MindSpore 2.5 offers a lightweight profiler capability that helps obtain performance data for key model metrics in large-scale clusters. As illustrated in the following figure, users can add custom marks through the mstx.mark, mstx.range_start, and mstx.range_end interfaces; built-in marking of communication operators is also supported. When users enable the lightweight marking function, marks are automatically added before and after the communication operators. All marking tasks are issued by the runtime to the device side, so the time points or time slices of the marking tasks can be presented on both the host side and the device side. + +![mstx_profiler.png](../../source_zh_cn/debug/images/mstx_profiler.png) + +For details about the mstx interface, please refer to [mstx API](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.profiler.mstx.html). + +The lightweight marking sample is shown below: + +```python +from mindspore.profiler import mstx + +range_id = mstx.range_start("train") +mstx.mark("start") +# train_step +mstx.range_end(range_id) +``` + +For a complete example, refer to the [mstx lightweight marking sample](https://gitee.com/mindspore/docs/blob/master/docs/sample_code/profiler/mstx_profiler.py). + ## Performance Data Users can collect, parse, and analyze performance data through MindSpore Profiler, including raw performance data from the framework side, CANN side, and device side, as well as parsed performance data. 
diff --git a/tutorials/source_zh_cn/debug/images/mstx_profiler.png b/tutorials/source_zh_cn/debug/images/mstx_profiler.png new file mode 100644 index 0000000000000000000000000000000000000000..08c8e79abf974a2d04be42c75cc36614a1ee0947 Binary files /dev/null and b/tutorials/source_zh_cn/debug/images/mstx_profiler.png differ diff --git a/tutorials/source_zh_cn/debug/profiler.md b/tutorials/source_zh_cn/debug/profiler.md index 0923219e9ef4728c05eb42b025d6a03d6aa6d597..89b59c545f17b9d6250f270d028038d2699fa611 100644 --- a/tutorials/source_zh_cn/debug/profiler.md +++ b/tutorials/source_zh_cn/debug/profiler.md @@ -18,7 +18,7 @@ ## 使用方法 -收集训练性能数据有四种方式,以下将介绍根据不同场景下,使用Profiler使能的方式。 +收集训练性能数据有五种方式,以下将介绍根据不同场景下,使用Profiler使能的方式。 ### 方式一:mindspore.profiler.profile接口使能 @@ -181,6 +181,27 @@ from mindspore.profiler.profiler import analyse analyse("./profiler_data_path") # './profiler_data_path'为离线解析数据路径 ``` +### 方式五:轻量化打点 + +针对大集群场景传统Profiler流程重、数据量大的问题,提供轻量化Profiler能力,帮助大集群场景轻量化获取模型关键指标性能数据。如下图所示,用户可通过mstx.mark、mstx.range_start、mstx.range_end接口自定义打点,同时支持通信算子的内置打点:用户开启轻量化打点功能后,通信算子前后将自动实现打点。所有的打点任务由runtime下发至device侧,可呈现打点任务在host侧和device侧的时间点或时间片。 + +![mstx_profiler.png](./images/mstx_profiler.png) + +mstx接口详细介绍请参考[mstx API文档](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.profiler.mstx.html)。 + +轻量化打点样例如下: + +```python +from mindspore.profiler import mstx + +range_id = mstx.range_start("train") +mstx.mark("start") +# train_step +mstx.range_end(range_id) +``` + +完整案例请参考[mstx轻量化打点方式案例](https://gitee.com/mindspore/docs/blob/master/docs/sample_code/profiler/mstx_profiler.py)。 + ## 性能数据 用户通过MindSpore Profiler采集、解析后的性能数据包括框架侧、CANN侧和device侧的原始性能数据,以及解析后的性能数据。