From bf8d810d6ad213473e78a2843833be3503179ec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B7=A5=E5=85=B7=E4=BA=BA=E5=91=B5=E5=91=B5?= <1340909670@qq.com> Date: Mon, 14 Oct 2024 01:34:37 +0000 Subject: [PATCH 01/15] =?UTF-8?q?=E8=A7=84=E8=8C=83=E5=92=8C=E4=BD=8E?= =?UTF-8?q?=E9=94=99=E7=B1=BB=EF=BC=9A=20-=20=20=E9=94=99=E5=88=AB?= =?UTF-8?q?=E5=AD=97=E6=88=96=E6=8B=BC=E5=86=99=E9=94=99=E8=AF=AF=EF=BC=8C?= =?UTF-8?q?=E6=A0=87=E7=82=B9=E7=AC=A6=E5=8F=B7=E4=BD=BF=E7=94=A8=E9=94=99?= =?UTF-8?q?=E8=AF=AF=E3=80=81=E5=85=AC=E5=BC=8F=E9=94=99=E8=AF=AF=E6=88=96?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E5=BC=82=E5=B8=B8=EF=BC=9B=20-=20=20?= =?UTF-8?q?=E9=93=BE=E6=8E=A5=E9=94=99=E8=AF=AF=E3=80=81=E7=A9=BA=E5=8D=95?= =?UTF-8?q?=E5=85=83=E6=A0=BC=E3=80=81=E6=A0=BC=E5=BC=8F=E9=94=99=E8=AF=AF?= =?UTF-8?q?=EF=BC=9B=20-=20=20=E8=8B=B1=E6=96=87=E4=B8=AD=E5=8C=85?= =?UTF-8?q?=E5=90=AB=E4=B8=AD=E6=96=87=E5=AD=97=E7=AC=A6=EF=BC=9B=20-=20?= =?UTF-8?q?=20=E7=95=8C=E9=9D=A2=E5=92=8C=E6=8F=8F=E8=BF=B0=E4=B8=8D?= =?UTF-8?q?=E4=B8=80=E8=87=B4=EF=BC=8C=E4=BD=86=E4=B8=8D=E5=BD=B1=E5=93=8D?= =?UTF-8?q?=E6=93=8D=E4=BD=9C=EF=BC=9B=20-=20=20=E8=A1=A8=E8=BF=B0?= =?UTF-8?q?=E4=B8=8D=E9=80=9A=E9=A1=BA=EF=BC=8C=E4=BD=86=E4=B8=8D=E5=BD=B1?= =?UTF-8?q?=E5=93=8D=E7=90=86=E8=A7=A3=EF=BC=9B=20-=20=20=E7=89=88?= =?UTF-8?q?=E6=9C=AC=E5=8F=B7=E4=B8=8D=E5=8C=B9=E9=85=8D=EF=BC=9A=E5=A6=82?= =?UTF-8?q?=E8=BD=AF=E4=BB=B6=E5=8C=85=E5=90=8D=E7=A7=B0=E3=80=81=E7=95=8C?= =?UTF-8?q?=E9=9D=A2=E7=89=88=E6=9C=AC=E5=8F=B7=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 工具人呵呵 <1340909670@qq.com> --- docs/mindspore/source_zh_cn/orange_pi/environment_setup.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/mindspore/source_zh_cn/orange_pi/environment_setup.md b/docs/mindspore/source_zh_cn/orange_pi/environment_setup.md index 91c7a509f8..fe24da388e 100644 --- a/docs/mindspore/source_zh_cn/orange_pi/environment_setup.md +++ b/docs/mindspore/source_zh_cn/orange_pi/environment_setup.md @@ -64,7 +64,7 @@ 步骤1 Rufus下载。 - 点击[此链接](https://etcher.balena.io/),进行下载、安装。 + 点击[此链接](https://github.com/pbatard/rufus/releases/download/v4.5/rufus-4.5.exe),进行下载、安装。 ### 1.4 选择和烧录镜像 -- Gitee From 463d9815774d59586a119b8669f27801b84c2ae4 Mon Sep 17 00:00:00 2001 From: yuhan Date: Mon, 14 Oct 2024 18:15:31 +0800 Subject: [PATCH 02/15] modify err code --- docs/mindspore/source_en/model_train/parallel/mpirun.md | 2 +- docs/mindspore/source_zh_cn/model_train/parallel/mpirun.md | 2 +- docs/sample_code/startup_method/run_mpirun_2.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/mindspore/source_en/model_train/parallel/mpirun.md b/docs/mindspore/source_en/model_train/parallel/mpirun.md index 04c2db3e34..92d1fbfd31 100644 --- a/docs/mindspore/source_en/model_train/parallel/mpirun.md +++ b/docs/mindspore/source_en/model_train/parallel/mpirun.md @@ -207,7 +207,7 @@ After the configuration is successful, you can start the multi-machine task with ```bash export DATA_PATH=./MNIST_Data/train/ HOSTFILE=$1 - mpirun -n 16 --hostfile $HOSTFILE --output-filename log_output --merge-stderr-to-stdout python net.sh + mpirun -n 16 --hostfile $HOSTFILE --output-filename log_output --merge-stderr-to-stdout python net.py ``` Execute on one of the nodes: diff --git a/docs/mindspore/source_zh_cn/model_train/parallel/mpirun.md b/docs/mindspore/source_zh_cn/model_train/parallel/mpirun.md index 0996175dbc..19b1970750 100644 --- a/docs/mindspore/source_zh_cn/model_train/parallel/mpirun.md +++ 
b/docs/mindspore/source_zh_cn/model_train/parallel/mpirun.md @@ -207,7 +207,7 @@ epoch: 0, step: 100, loss is 0.6298542 ```bash export DATA_PATH=./MNIST_Data/train/ HOSTFILE=$1 - mpirun -n 16 --hostfile $HOSTFILE --output-filename log_output --merge-stderr-to-stdout python net.sh + mpirun -n 16 --hostfile $HOSTFILE --output-filename log_output --merge-stderr-to-stdout python net.py ``` 在其中一个节点执行: diff --git a/docs/sample_code/startup_method/run_mpirun_2.sh b/docs/sample_code/startup_method/run_mpirun_2.sh index e9ccae8ecb..e5ca9cdeb5 100644 --- a/docs/sample_code/startup_method/run_mpirun_2.sh +++ b/docs/sample_code/startup_method/run_mpirun_2.sh @@ -16,4 +16,4 @@ export DATA_PATH=${EXEC_PATH}/MNIST_Data/train/ HOSTFILE=$1 mpirun -n 16 --hostfile $HOSTFILE --output-filename log_output \ - --merge-stderr-to-stdout python net.sh + --merge-stderr-to-stdout python net.py -- Gitee From 60bbb534028b62295455c5aecf0f8f2878c37dc1 Mon Sep 17 00:00:00 2001 From: SaiYao Date: Mon, 14 Oct 2024 20:08:25 +0800 Subject: [PATCH 03/15] =?UTF-8?q?=E3=80=90bugfix=E3=80=91=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=E6=95=B0=E6=8D=AE=E9=9B=86=E6=96=87=E6=A1=A3=E4=B8=AD?= =?UTF-8?q?=E7=9A=84=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/mindformers/docs/source_en/function/dataset.md | 2 +- docs/mindformers/docs/source_zh_cn/function/dataset.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/mindformers/docs/source_en/function/dataset.md b/docs/mindformers/docs/source_en/function/dataset.md index 4a6780e5ea..74c6140a6e 100644 --- a/docs/mindformers/docs/source_en/function/dataset.md +++ b/docs/mindformers/docs/source_en/function/dataset.md @@ -69,7 +69,7 @@ The following is an example of a MindRecord dataset based on a json format file, """tokenize json file dataset""" content = [] # Read each json data and get its “input_ids”. for line in raw_data: - stripped_line = line.strip() + stripped_line = line['input_ids'].strip() if stripped_line: line_ids = tokenizer(stripped_line)["input_ids"] content.append(line_ids) diff --git a/docs/mindformers/docs/source_zh_cn/function/dataset.md b/docs/mindformers/docs/source_zh_cn/function/dataset.md index 4f5cb2df48..6fa4f5162c 100644 --- a/docs/mindformers/docs/source_zh_cn/function/dataset.md +++ b/docs/mindformers/docs/source_zh_cn/function/dataset.md @@ -69,7 +69,7 @@ MindRecord 模块提供了一些方法帮助用户将不同数据集转换为 Mi """tokenize json file dataset""" content = [] # 读取每个 json 数据,获取其 "input_ids" for line in raw_data: - stripped_line = line.strip() + stripped_line = line['input_ids'].strip() if stripped_line: line_ids = tokenizer(stripped_line)["input_ids"] content.append(line_ids) -- Gitee From fb4fd985a6aef8590107e4a85ef367a725d846ad Mon Sep 17 00:00:00 2001 From: huan <3174348550@qq.com> Date: Tue, 15 Oct 2024 09:49:15 +0800 Subject: [PATCH 04/15] modify the links --- docs/mindspore/source_en/orange_pi/environment_setup.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/mindspore/source_en/orange_pi/environment_setup.md b/docs/mindspore/source_en/orange_pi/environment_setup.md index 876ad78091..405d096cc8 100644 --- a/docs/mindspore/source_en/orange_pi/environment_setup.md +++ b/docs/mindspore/source_en/orange_pi/environment_setup.md @@ -64,7 +64,7 @@ There are two card-making tools balenaEtcher, Rufus, and you can choose any one Step 1 Download Rufus - Click [this link](https://etcher.balena.io/) to download and install. 
+ Click [this link](https://github.com/pbatard/rufus/releases/download/v4.5/rufus-4.5.exe) to download and install. ### 1.4 Selecting and Burning Images -- Gitee From 347fbb2cb2354599ccff6d6cf6701ba62193b177 Mon Sep 17 00:00:00 2001 From: yuhan Date: Tue, 15 Oct 2024 10:05:22 +0800 Subject: [PATCH 05/15] modify requirement --- docs/mindspore/requirements.txt | 2 +- .../source_zh_cn/model_infer/ms_infer/weight_prepare.md | 2 +- .../source_zh_cn/model_infer/ms_infer/weight_split.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/mindspore/requirements.txt b/docs/mindspore/requirements.txt index 5743688256..15a923bbf3 100644 --- a/docs/mindspore/requirements.txt +++ b/docs/mindspore/requirements.txt @@ -8,4 +8,4 @@ IPython jieba chardet jinja2 < 3.1 -sphinxcontrib-mermaid \ No newline at end of file +sphinxcontrib-mermaid == 0.9.2 \ No newline at end of file diff --git a/docs/mindspore/source_zh_cn/model_infer/ms_infer/weight_prepare.md b/docs/mindspore/source_zh_cn/model_infer/ms_infer/weight_prepare.md index 1d309946ad..c42c28255d 100644 --- a/docs/mindspore/source_zh_cn/model_infer/ms_infer/weight_prepare.md +++ b/docs/mindspore/source_zh_cn/model_infer/ms_infer/weight_prepare.md @@ -4,7 +4,7 @@ 模型权重作为大语言模型最为重要的参数,通常直接和模型最终效果强相关,因此获取有效可靠的模型权重文件,成为准备大语言模型推理非常重要的一步。总的来说,获取模型权重文件有两大类方案: -- **自己通过数据集训练权重**:利用MindSpore框架训练能力,以及业务强相关的数据集,从头训练或者对模型进行微调,然后输出模型的权重文件,该方案需要使用MindSpore训练能力,同时需要较大的计算资源来训练模型,比较适合用户自己数据集比较特殊的场景。具体可以参考[MindSpore训练流程](../../model_train/train_process/overview.md) 和[保存模型权重CKPT文件](https://www.mindspore.cn/docs/zh-CN/r2.3.1/api_python/mindspore/mindspore.save_checkpoint.html?highlight=save#mindspore.save_checkpoint)。 +- **自己通过数据集训练权重**:利用MindSpore框架训练能力,以及业务强相关的数据集,从头训练或者对模型进行微调,然后输出模型的权重文件,该方案需要使用MindSpore训练能力,同时需要较大的计算资源来训练模型,比较适合用户自己数据集比较特殊的场景。具体可以参考[MindSpore训练流程](../../model_train/train_process/overview.md) 和[保存模型权重CKPT文件](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.save_checkpoint.html?highlight=save#mindspore.save_checkpoint)。 - **从官网获取预训练模型权重**:从主流模型官方网站上获取预训练好的模型配置、tokenizer和权重文件等,并通过MindSpore框架能力将模型权重转换成MindSpore的CKPT权重文件,作为大语言模型推理的输入。 diff --git a/docs/mindspore/source_zh_cn/model_infer/ms_infer/weight_split.md b/docs/mindspore/source_zh_cn/model_infer/ms_infer/weight_split.md index f02d570e50..48f35ee872 100644 --- a/docs/mindspore/source_zh_cn/model_infer/ms_infer/weight_split.md +++ b/docs/mindspore/source_zh_cn/model_infer/ms_infer/weight_split.md @@ -151,10 +151,10 @@ def save_strategy_file(state_dict, strategy_file_name): raise e ``` -得到推理网络的并行策略文件后,可以根据[执行分布式checkpoint转换](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.3.1/parallel/model_transformation.html#执行分布式checkpoint转换)方法,将训练权重转换为推理所需权重。 +得到推理网络的并行策略文件后,可以根据[执行分布式checkpoint转换](https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/model_transformation.html#执行分布式checkpoint转换)方法,将训练权重转换为推理所需权重。 具体端到端的权重切分代码工程可以参考[权重切分](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_zh_cn/model_infer/ms_infer/code//param_split.py)。 ## 权重加载 -分布式权重加载可以参考[加载转换得到的checkpoint文件](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.3.1/parallel/model_transformation.html#加载转换得到的checkpoint文件)教程。 \ No newline at end of file +分布式权重加载可以参考[加载转换得到的checkpoint文件](https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/model_transformation.html#加载转换得到的checkpoint文件)教程。 \ No newline at end of file -- Gitee From a889fe584ad111dfdb74d77b5c30f8830e2f7119 Mon Sep 17 00:00:00 2001 From: Yule100 <2538776509@qq.com> Date: Tue, 15 Oct 2024 10:20:59 
+0800 Subject: [PATCH 06/15] =?UTF-8?q?bugfix=20PLOG=E6=97=A5=E5=BF=97=E9=87=8D?= =?UTF-8?q?=E5=AE=9A=E5=90=91=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=E6=8B=BC?= =?UTF-8?q?=E5=86=99=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/mindformers/docs/source_en/appendix/env_variables.md | 2 +- docs/mindformers/docs/source_zh_cn/appendix/env_variables.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/mindformers/docs/source_en/appendix/env_variables.md b/docs/mindformers/docs/source_en/appendix/env_variables.md index 6529e3ebb9..5c722a6efb 100644 --- a/docs/mindformers/docs/source_en/appendix/env_variables.md +++ b/docs/mindformers/docs/source_en/appendix/env_variables.md @@ -36,4 +36,4 @@ The following environment variables are supported by MindFormers. | **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | Whether to enable lazy inline under non-pipeline parallel. | `0`: turn off lazy inline;
`1`: turn on lazy inline. |The lazy inline feature is only enabled in pipeline parallel mode by default. To enable lazy inline in other parallel modes, set this environment variable to 1. | | **MS_ASCEND_CHECK_OVERFLOW_MODE** | INFNAN_MODE | Sets the overflow detection mode. | `SATURATION_MODE`: saturation mode, saturates to floating-point extremes (+-MAX) when the calculation overflows;
`INFNAN_MODE`: INF/NAN mode, follows the IEEE 754 standard, and outputs INF/NAN calculations as defined. | In large model tuning, the overflow state is aligned PyTorch and it is recommended to use INFNAN_MODE, i.e. export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE.
Try setting this variable to INFNAN_MODE when encountering persistent overflow problems. | | **MF_LOG_SUFFIX** | NA | Set custom suffixes for all log log folders. | Suffix for the log folder. Default: no suffix | Adding a consistent suffix isolates logs across tasks from being overwritten. | -| **PLOG_REDICT_TO_OUTPUT** | False | Controls whether plog logs change storage paths. | `True`: store the logs in the ./output directory;
`False`: Store to the default storage location. | This setting makes it easier to query the plog log. | +| **PLOG_REDIRECT_TO_OUTPUT** | False | Controls whether plog logs change storage paths. | `True`: store the logs in the ./output directory;
`False`: Store to the default storage location. | This setting makes it easier to query the plog log. | diff --git a/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md b/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md index 9dc9a122ee..e3cf2b426b 100644 --- a/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md +++ b/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md @@ -36,4 +36,4 @@ | **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | 是否开启在非 pipeline 并行下的 lazy inline。 | `0`:关闭 lazy inline;
`1`:开启 lazy inline。 | lazy inline 特性默认仅在 pipeline 并行模式下开启。如需在其他并行模式下使能 lazy inline,可将该环境变量设置为 1。 | | **MS_ASCEND_CHECK_OVERFLOW_MODE** | INFNAN_MODE | 设置浮点计算结果输出模式。 | `SATURATION_MODE`:饱和模式,计算出现溢出时,饱和为浮点数极值(+-MAX);
`INFNAN_MODE`:INF/NAN 模式,遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。 | 在大模型调优中,溢出状态对齐 PyTorch 方式,建议使用 INFNAN_MODE,即 export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE。
遇到持续溢出问题时可尝试设置此变量为 INFNAN_MODE。 | | **MF_LOG_SUFFIX** | NA | 设置所有 log 日志文件夹的自定义后缀。 | log 文件夹的后缀。默认值:无后缀 | 添加一致的后缀,可以隔离各个任务的日志,不会被覆写。 | -| **PLOG_REDICT_TO_OUTPUT** | False | 控制 plog 日志是否改变存储路径。 | `True`:存储到./output 目录下;
`False`: 存储到默认存储位置。 | 设置之后方便用户查询 plog 日志。 | +| **PLOG_REDIRECT_TO_OUTPUT** | False | 控制 plog 日志是否改变存储路径。 | `True`:存储到./output 目录下;
`False`: 存储到默认存储位置。 | 设置之后方便用户查询 plog 日志。 | -- Gitee From 6d81df8095e7637df4b10eacea18a2e1af7bebc2 Mon Sep 17 00:00:00 2001 From: fandawei Date: Tue, 15 Oct 2024 11:07:07 +0800 Subject: [PATCH 07/15] fix dump device_stat_precision_mode spell error --- docs/mindspore/source_zh_cn/model_train/debug/dump.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/mindspore/source_zh_cn/model_train/debug/dump.md b/docs/mindspore/source_zh_cn/model_train/debug/dump.md index 2daa81993b..d402607bec 100644 --- a/docs/mindspore/source_zh_cn/model_train/debug/dump.md +++ b/docs/mindspore/source_zh_cn/model_train/debug/dump.md @@ -175,7 +175,7 @@ MindSpore在不同模式下支持的Dump功能如下表所示: - `enable`:设置成true,表示开启同步Dump;设置成false时,采用异步Dump。不设置该字段时默认值为false,开启异步Dump。两者的区别是异步Dump对原本代码执行过程的影响更小。 - `trans_flag`:开启格式转换,将设备上的数据格式转换成NCHW格式。若为`true`,则数据会以Host侧的4D格式(NCHW)格式保存;若为`false`,则保留Device侧的数据格式。该配置参数在CPU上无效,因为CPU上没有format转换。默认值:true。 - `stat_calc_mode`:选择统计信息计算后端,可选"host"和"device"。选择"device"后可以使能device计算统计信息,当前只在Ascend生效,只支持`min/max/avg/l2norm`统计量。在op_debug_mode设置为3时,仅支持将`stat_calc_mode`设置为"host"。 - - `device_stat_precison_mode`(可选):device统计信息精度模式,可选"high"和"low"。选择"high"时,`avg/l2norm`统计量使用float32进行计算,会增加device内存占用,精度更高;为"low"时使用与原始数据相同的类型进行计算,device内存占用较少,但在处理较大数值时可能会导致统计量溢出。默认值为"high"。 + - `device_stat_precision_mode`(可选):device统计信息精度模式,可选"high"和"low"。选择"high"时,`avg/l2norm`统计量使用float32进行计算,会增加device内存占用,精度更高;为"low"时使用与原始数据相同的类型进行计算,device内存占用较少,但在处理较大数值时可能会导致统计量溢出。默认值为"high"。 - `sample_mode`(可选):设置成0,表示不开启切片dump功能;设置成1时,在图编译等级为O0或O1的情况下开启切片dump功能。仅在op_debug_mode设置为0时生效,其他场景不会开启切片dump功能。 - `sample_num`(可选):用于控制切片dump中切片的大小。默认值为100。 - `save_kernel_args`(可选): 设置成true时,会保存算子的初始化信息。仅当`enable`设置为`true`时生效。 -- Gitee From f5c23431a270eef4c4ebc801caca43302d1952a7 Mon Sep 17 00:00:00 2001 From: SaiYao Date: Tue, 15 Oct 2024 17:38:43 +0800 Subject: [PATCH 08/15] =?UTF-8?q?=E3=80=90update=E3=80=91=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E6=95=B0=E6=8D=AE=E9=9B=86=E6=96=87=E6=A1=A3=E7=9A=84?= =?UTF-8?q?=E6=8F=8F=E8=BF=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/mindformers/docs/source_en/function/dataset.md | 6 +++--- docs/mindformers/docs/source_zh_cn/function/dataset.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/mindformers/docs/source_en/function/dataset.md b/docs/mindformers/docs/source_en/function/dataset.md index 74c6140a6e..4ea2723f97 100644 --- a/docs/mindformers/docs/source_en/function/dataset.md +++ b/docs/mindformers/docs/source_en/function/dataset.md @@ -25,10 +25,10 @@ The following is an example of a MindRecord dataset based on a json format file, ```json [ { - "input_ids": "I love Beijing, because it is a city that beautifully blends rich history with modern vibrancy." + "text": "I love Beijing, because it is a city that beautifully blends rich history with modern vibrancy." }, { - "input_ids": "I love Hangzhou, because it is a city that seamlessly combines natural beauty with rich cultural heritage." + "text": "I love Hangzhou, because it is a city that seamlessly combines natural beauty with rich cultural heritage." } ] ``` @@ -69,7 +69,7 @@ The following is an example of a MindRecord dataset based on a json format file, """tokenize json file dataset""" content = [] # Read each json data and get its “input_ids”. 
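# Illustrative note: each element of raw_data is a dict such as
# {"text": "I love Beijing, ..."} (see the json example above); the tokenizer
# turns its "text" field into the "input_ids" collected here.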
for line in raw_data: - stripped_line = line['input_ids'].strip() + stripped_line = line['text'].strip() if stripped_line: line_ids = tokenizer(stripped_line)["input_ids"] content.append(line_ids) diff --git a/docs/mindformers/docs/source_zh_cn/function/dataset.md b/docs/mindformers/docs/source_zh_cn/function/dataset.md index 6fa4f5162c..77e9034301 100644 --- a/docs/mindformers/docs/source_zh_cn/function/dataset.md +++ b/docs/mindformers/docs/source_zh_cn/function/dataset.md @@ -25,10 +25,10 @@ MindRecord 模块提供了一些方法帮助用户将不同数据集转换为 Mi ```json [ { - "input_ids": "I love Beijing, because it is a city that beautifully blends rich history with modern vibrancy." + "text": "I love Beijing, because it is a city that beautifully blends rich history with modern vibrancy." }, { - "input_ids": "I love Hangzhou, because it is a city that seamlessly combines natural beauty with rich cultural heritage." + "text": "I love Hangzhou, because it is a city that seamlessly combines natural beauty with rich cultural heritage." } ] ``` @@ -69,7 +69,7 @@ MindRecord 模块提供了一些方法帮助用户将不同数据集转换为 Mi """tokenize json file dataset""" content = [] # 读取每个 json 数据,获取其 "input_ids" for line in raw_data: - stripped_line = line['input_ids'].strip() + stripped_line = line['text'].strip() if stripped_line: line_ids = tokenizer(stripped_line)["input_ids"] content.append(line_ids) -- Gitee From 184d0cd1e27b1720200ebf25a650198934da5dcb Mon Sep 17 00:00:00 2001 From: smallsilly <1045916357@qq.com> Date: Tue, 15 Oct 2024 11:48:39 +0800 Subject: [PATCH 09/15] =?UTF-8?q?=E3=80=90=E9=97=AE=E9=A2=98=E5=8D=95?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E3=80=91=E4=BF=AE=E5=A4=8D=E4=BD=BF=E7=94=A8?= =?UTF-8?q?MindFormers=E6=A1=86=E6=9E=B6=E6=8E=A8=E7=90=86=E5=A4=A7?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=EF=BC=88llama2-70b=EF=BC=89=E6=B5=81?= =?UTF-8?q?=E7=A8=8B=E5=AD=98=E5=9C=A8=E7=9A=84=E9=83=A8=E5=88=86=E8=B5=84?= =?UTF-8?q?=E6=96=99=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../source_en/function/weight_conversion.md | 42 +++++++------------ .../docs/source_en/usage/inference.md | 4 +- .../function/weight_conversion.md | 42 +++++++------------ .../docs/source_zh_cn/usage/inference.md | 4 +- 4 files changed, 38 insertions(+), 54 deletions(-) diff --git a/docs/mindformers/docs/source_en/function/weight_conversion.md b/docs/mindformers/docs/source_en/function/weight_conversion.md index f4179098c6..5a12857cbc 100644 --- a/docs/mindformers/docs/source_en/function/weight_conversion.md +++ b/docs/mindformers/docs/source_en/function/weight_conversion.md @@ -36,38 +36,28 @@ python convert_weight.py [-h] --model MODEL [--reversed] --input_path INPUT_PATH Assume that you have downloaded the [Llama2 model weight](https://gitee.com/mindspore/mindformers/blob/dev/docs/model_cards/llama2.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD) and saved it in the `/home/user/torch_weights` path. To convert it to the MindFormers weight and save it in the `/home/user/ms_weights` path, run the following command: ```bash -python convert_weight.py --model llama2 --input_path /home/user/torch_weights --output_path /home/user/ms_weights/llama.ckpt +python convert_weight.py --model llama --input_path /home/user/torch_weights --output_path /home/user/ms_weights/llama.ckpt ``` After the preceding steps are performed, the HuggingFace weight is successfully converted to a MindFormers weight, facilitating model training or inference on MindFormers. 
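As a quick sanity check, the converted file can be loaded with `mindspore.load_checkpoint` to confirm that it is a readable MindSpore checkpoint. This is an illustrative sketch only, reusing the output path from the example above:

```python
import mindspore as ms

# Load the converted checkpoint and print a few parameter names and shapes.
param_dict = ms.load_checkpoint("/home/user/ms_weights/llama.ckpt")
for name in list(param_dict)[:5]:
    print(name, param_dict[name].shape)
```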
## Supported Models -- Baichuan -- BLIP -- BLOOM -- CodeGeeX2 -- CogVLM2 -- DeepSeek -- GLM -- GLM-n -- GPT -- InternLM -- InternLM2 -- knowlm -- Llama -- MAE -- Mixtral -- Qwen -- Qwen2 -- Qwen2-MoE -- Qwen-VL -- Skywork -- Swin -- TeleChat -- ViT -- WizardCoder -- Yi +| Parameter Value | Supported Models | +|-----------|---------------------------------------------| +| llama | Llama2, Llama3, Llama3.1, CodeLlama | +| baichuan2 | Baichuan2 | +| glm-n | GLM2, GLM3, GLM3-32K, GLM4 | +| cogvlm2 | CogVLM2-Video, CogVLM2-Image | +| qwen | Qwen, Qwen1.5, Qwen2 | +| qwenvl | QwenVL | +| internlm | InternLM | +| internlm2 | InternLM2 | +| yi | Yi | +| mixtral | Mixtral | +| deepseek | DeepSeekCoder, DeepSeekCoder1.5, DeepSeekV2 | +| gpt | GPT2 | +| whisper | Whisper | ## Developing Weight Conversion for Unsupported Models diff --git a/docs/mindformers/docs/source_en/usage/inference.md index a855cdaef6..ecd4bed824 100644 --- a/docs/mindformers/docs/source_en/usage/inference.md +++ b/docs/mindformers/docs/source_en/usage/inference.md @@ -113,7 +113,7 @@ The inference result is as follows: ## Inference Based on the run_mindformer Script -For single-device inference, you can directly run [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/dev/run_mindformer.py). For multi-device inference, you need to run [scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/dev/scripts/msrun_launcher.sh). Take Llama2 as an example. You are advised to configure the [predict_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/dev/configs/llama2/predict_llama2_7b.yaml) file. +For single-device inference, you can directly run [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/dev/run_mindformer.py). For multi-device inference, you need to run [scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/dev/scripts/msrun_launcher.sh). Take Llama2 as an example. You are advised to configure the [predict_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/dev/configs/llama2/predict_llama2_7b.yaml) file. During inference, the vocabulary file `tokenizer.model` required by the Llama2 model is downloaded automatically (a working network connection is required). If the file already exists locally, you can place it in the `./checkpoint_download/llama2/` directory in advance. ## Single-Device Inference @@ -128,6 +128,8 @@ python run_mindformer.py \ ## Multi-Device Inference +Executing the script will start the multi card process, and the logs will be redirected to the `./output/msrun_log` directory. Please check the log files in it. When the inference result is printed, it proves that the inference is successful. <br>
+ ```shell bash scripts/msrun_launcher.sh "python run_mindformer.py \ --config configs/llama2/predict_llama2_7b.yaml \ diff --git a/docs/mindformers/docs/source_zh_cn/function/weight_conversion.md b/docs/mindformers/docs/source_zh_cn/function/weight_conversion.md index 5586bc75aa..15139854bd 100644 --- a/docs/mindformers/docs/source_zh_cn/function/weight_conversion.md +++ b/docs/mindformers/docs/source_zh_cn/function/weight_conversion.md @@ -38,38 +38,28 @@ python convert_weight.py [-h] --model MODEL [--reversed] --input_path INPUT_PATH ,并保存在路径`/home/user/torch_weights`中,用户希望将其转换为MindFormers权重并保存在路径`/home/user/ms_weights`中,可以使用以下命令: ```bash -python convert_weight.py --model llama2 --input_path /home/user/torch_weights --output_path /home/user/ms_weights/llama.ckpt +python convert_weight.py --model llama --input_path /home/user/torch_weights --output_path /home/user/ms_weights/llama.ckpt ``` 通过以上步骤,可将HuggingFace权重成功转换为MindFormers权重,方便在MindFormers中继续模型训练或推理。 ## 已支持模型 -- Baichuan -- BLIP -- BLOOM -- CodeGeeX2 -- CogVLM2 -- DeepSeek -- GLM -- GLM-n -- GPT -- InternLM -- InternLM2 -- knowlm -- Llama -- MAE -- Mixtral -- Qwen -- Qwen2 -- Qwen2-MoE -- Qwen-VL -- Skywork -- Swin -- TeleChat -- ViT -- WizardCoder -- Yi +| 参数取值 | 支持模型 | +|-----------|-------------------------------------------| +| llama | Llama2、Llama3、Llama3.1、CodeLlama | +| baichuan2 | Baichuan2 | +| glm-n | GLM2、GLM3、GLM3-32K、GLM4 | +| cogvlm2 | CogVLM2-Video、CogVLM2-Image | +| qwen | Qwen、Qwen1.5、Qwen2 | +| qwenvl | QwenVL | +| internlm | InternLM | +| internlm2 | InternLM2 | +| yi | Yi | +| mixtral | Mixtral | +| deepseek | DeepSeekCoder、DeepSeekCoder1.5、DeepSeekV2 | +| gpt | GPT2 | +| whisper | Whisper | ## 未支持模型权重转换开发 diff --git a/docs/mindformers/docs/source_zh_cn/usage/inference.md b/docs/mindformers/docs/source_zh_cn/usage/inference.md index d6aa0a104a..a56b364e4a 100644 --- a/docs/mindformers/docs/source_zh_cn/usage/inference.md +++ b/docs/mindformers/docs/source_zh_cn/usage/inference.md @@ -113,7 +113,7 @@ python pipeline_inference.py ## 基于 run_mindformer 脚本推理 -单卡推理可以直接执行[run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/dev/run_mindformer.py),多卡推理需要借助 [scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/dev/scripts/msrun_launcher.sh) 启动。以 Llama2 为例,推荐配置为[predict_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/dev/configs/llama2/predict_llama2_7b.yaml)文件。 +单卡推理可以直接执行[run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/dev/run_mindformer.py),多卡推理需要借助 [scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/dev/scripts/msrun_launcher.sh) 启动。以 Llama2 为例,推荐配置为[predict_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/dev/configs/llama2/predict_llama2_7b.yaml)文件。推理时会自动下载Llama2模型所需词表文件 `tokenizer.model` (需要保障网络畅通)。如果本地有这个文件,可以提前把它放在 `./checkpoint_download/llama2/` 目录下。 ## 单卡推理 @@ -128,6 +128,8 @@ python run_mindformer.py \ ## 多卡推理 +执行脚本会拉起多卡进程,日志会重定向至 `./output/msrun_log` 下,请查看其中的日志文件。当有推理结果打印,证明推理成功。 + ```shell bash scripts/msrun_launcher.sh "python run_mindformer.py \ --config configs/llama2/predict_llama2_7b.yaml \ -- Gitee From 5fdd0851bb085db0c9edca3b33ed87482de2502b Mon Sep 17 00:00:00 2001 From: smallsilly <1045916357@qq.com> Date: Tue, 15 Oct 2024 11:48:39 +0800 Subject: [PATCH 10/15] =?UTF-8?q?=E3=80=90=E9=97=AE=E9=A2=98=E5=8D=95?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E3=80=91=E4=BF=AE=E5=A4=8D=E4=BD=BF=E7=94=A8?= =?UTF-8?q?MindFormers=E6=A1=86=E6=9E=B6=E6=8E=A8=E7=90=86=E5=A4=A7?= 
=?UTF-8?q?=E6=A8=A1=E5=9E=8B=EF=BC=88llama2-70b=EF=BC=89=E6=B5=81?= =?UTF-8?q?=E7=A8=8B=E5=AD=98=E5=9C=A8=E7=9A=84=E9=83=A8=E5=88=86=E8=B5=84?= =?UTF-8?q?=E6=96=99=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/mindformers/docs/source_en/usage/inference.md | 2 +- docs/mindformers/docs/source_zh_cn/usage/inference.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/mindformers/docs/source_en/usage/inference.md index ecd4bed824..052eb71186 100644 --- a/docs/mindformers/docs/source_en/usage/inference.md +++ b/docs/mindformers/docs/source_en/usage/inference.md @@ -128,7 +128,7 @@ python run_mindformer.py \ ## Multi-Device Inference -Executing the script will start the multi card process, and the logs will be redirected to the `./output/msrun_log` directory. Please check the log files in it. When the inference result is printed, it proves that the inference is successful. +Executing the script will start the multi-card processes, and the logs will be redirected to the `./output/msrun_log` directory. When the `text_generation_result.txt` file appears in the current directory, the inference has succeeded. If the file does not appear, check the log files for details. ```shell bash scripts/msrun_launcher.sh "python run_mindformer.py \ --config configs/llama2/predict_llama2_7b.yaml \ diff --git a/docs/mindformers/docs/source_zh_cn/usage/inference.md index a56b364e4a..8aa7539d82 100644 --- a/docs/mindformers/docs/source_zh_cn/usage/inference.md +++ b/docs/mindformers/docs/source_zh_cn/usage/inference.md @@ -128,7 +128,7 @@ python run_mindformer.py \ ## 多卡推理 -执行脚本会拉起多卡进程,日志会重定向至 `./output/msrun_log` 下,请查看其中的日志文件。当有推理结果打印,证明推理成功。 +执行脚本会拉起多卡进程,日志会重定向至 `./output/msrun_log` 下。当前目录下出现 `text_generation_result.txt` 文件时,证明推理成功。若未出现该文件,可查看日志文件。 -- Gitee From 6177fda17fc52839144f1fcabc99a66764d78b75 Mon Sep 17 00:00:00 2001 From: yuhan Date: Thu, 17 Oct 2024 11:42:12 +0800 Subject: [PATCH 11/15] modify mindformers template --- .../docs/source_en/_templates/classtemplate.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/mindformers/docs/source_en/_templates/classtemplate.rst b/docs/mindformers/docs/source_en/_templates/classtemplate.rst index 258d294a6c..c4c11f4e55 100644 --- a/docs/mindformers/docs/source_en/_templates/classtemplate.rst +++ b/docs/mindformers/docs/source_en/_templates/classtemplate.rst @@ -10,6 +10,13 @@ :exclude-members: get_config_origin_mode, get_support_list, invalid_yaml_name :members: +{% elif fullname=="mindformers.modules.OpParallelConfig" %} +{{ fullname | underline }} +.. <br>
autoclass:: {{ name }} :exclude-members: construct, get_ulysses_cp_num, to_dict, to_diff_dict :members: {% elif fullname=="mindformers.AutoProcessor" %} {{ fullname | underline }} -- Gitee From 380d150a1a83c0567497551a2d433fcf68e0db12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cntuxianyu=E2=80=9D?= Date: Wed, 16 Oct 2024 10:53:15 +0800 Subject: [PATCH 12/15] add mempool_block_size explanation --- .../docs/source_en/appendix/conf_files.md | 1 + .../docs/source_zh_cn/appendix/conf_files.md | 17 +++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/mindformers/docs/source_en/appendix/conf_files.md index c467982099..dcc2de8fde 100644 --- a/docs/mindformers/docs/source_en/appendix/conf_files.md +++ b/docs/mindformers/docs/source_en/appendix/conf_files.md @@ -35,6 +35,7 @@ Context configuration is mainly used to specify the [mindspore.set_context](http | context.enable_graph_kernel | Enable graph fusion to optimize network execution performance, defaults to `False`. See [graph fusion](https://www.mindspore.cn/docs/en/master/model_train/optimize/graph_fusion_engine.html) for details. | bool | | context.max_call_depth | Set the maximum depth of a function call. The value must be a positive integer, and the default value is `1000`. | int | | context.max_device_memory | Set the maximum memory available to the device in the format "xxGB", and the default value is `1024GB`. | str | +| context.mempool_block_size | Set the size of the memory pool block for the device in the format "xxGB", and the default value is `"1GB"`. | str | | context.save_graphs | Save the compilation graph during execution.<br>
1. `False` or `0` indicates that the intermediate compilation map is not saved.
2. `1` means outputting some of the intermediate files generated during the compilation of the diagram.
3. `True` or `2` indicates the generation of more backend-process-related IR files.
4. `3` indicates the generation of visualized computational diagrams and more detailed front-end IR diagrams. | bool/int | ### Model Conguration diff --git a/docs/mindformers/docs/source_zh_cn/appendix/conf_files.md b/docs/mindformers/docs/source_zh_cn/appendix/conf_files.md index 0ba001f93c..9b87ec0ffe 100644 --- a/docs/mindformers/docs/source_zh_cn/appendix/conf_files.md +++ b/docs/mindformers/docs/source_zh_cn/appendix/conf_files.md @@ -27,14 +27,15 @@ MindFormers提供的`YAML`文件中包含对于不同功能的配置项,下面 Context配置主要用于指定[mindspore.set_context](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_context.html)中的相关参数。 -| 参数 | 说明 | 类型 | -|-----------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| context.mode | 设置后端执行模式,`0`表示GRAPH_MODE,MindFormers目前仅支持在GRAPH_MODE模式下运行 | int | -| context.device_target | 设置后端执行设备,MindFormers仅支持在`Ascend`设备上运行 | str | -| context.device_id | 设置执行设备ID,其值必须在可用设备范围内,默认值为`0` | int | -| context.enable_graph_kernel | 是否开启图算融合去优化网络执行性能, 默认值为`False`,详情可参考[图算融合](https://www.mindspore.cn/docs/zh-CN/master/model_train/optimize/graph_fusion_engine.html) | bool | -| context.max_call_depth | 设置函数调用的最大深度,其值必须为正整数,默认值为`1000` | int | -| context.max_device_memory | 设置设备可用的最大内存,格式为"xxGB",默认值为`1024GB` | str | +| 参数 | 说明 | 类型 | +| --------------------------- | ------------------------------------------------------------ | -------- | +| context.mode | 设置后端执行模式,`0`表示GRAPH_MODE,MindFormers目前仅支持在GRAPH_MODE模式下运行 | int | +| context.device_target | 设置后端执行设备,MindFormers仅支持在`Ascend`设备上运行 | str | +| context.device_id | 设置执行设备ID,其值必须在可用设备范围内,默认值为`0` | int | +| context.enable_graph_kernel | 是否开启图算融合去优化网络执行性能, 默认值为`False`,详情可参考[图算融合](https://www.mindspore.cn/docs/zh-CN/master/model_train/optimize/graph_fusion_engine.html) | bool | +| context.max_call_depth | 设置函数调用的最大深度,其值必须为正整数,默认值为`1000` | int | +| context.max_device_memory | 设置设备可用的最大内存,格式为"xxGB",默认值为`1024GB` | str | +| context.mempool_block_size | 设置内存块大小,格式为"xxGB",默认值为`1GB` | str | | context.save_graphs | 在执行过程中保存编译图。
1. `False`或`0`表示不保存中间编译图。
2. `1`表示运行时会输出图编译过程中生成的一些中间文件。
3. `True`或`2`表示生成更多后端流程相关的IR文件。
4. `3`表示生成可视化计算图和更多详细的前端IR图。 | bool/int | ### 模型配置 -- Gitee From c5b1928c7b66900d4cb857d06a859bf7863a5ca5 Mon Sep 17 00:00:00 2001 From: zouwenxiang Date: Wed, 16 Oct 2024 21:04:33 +0800 Subject: [PATCH 13/15] =?UTF-8?q?=E4=BF=9D=E7=95=99=20fa=E8=B5=B0=20BSH?= =?UTF-8?q?=E7=9A=84=E5=9B=9E=E9=80=80=E6=9C=BA=E5=88=B6=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../docs/source_en/appendix/env_variables.md | 23 ++++++++++--------- .../source_zh_cn/appendix/env_variables.md | 23 ++++++++++--------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/docs/mindformers/docs/source_en/appendix/env_variables.md b/docs/mindformers/docs/source_en/appendix/env_variables.md index 5c722a6efb..30423fc08b 100644 --- a/docs/mindformers/docs/source_en/appendix/env_variables.md +++ b/docs/mindformers/docs/source_en/appendix/env_variables.md @@ -26,14 +26,15 @@ The following environment variables are supported by MindFormers. ## Other Variables -| Variables Names | Default | Interpretations | Descriptions | Application Scenarios | -| ------------------------------- | ------ | ------------------------------------------------------ | -------------------------------- | ---------------------------------------------------------------------------------------------------------- | -| **RUN_MODE** | predict | Set the running mode. | `predict`: inference
`finetune`: Fine-tuning
`train`: Training
`eval`: Evaluation | -| **USE_ROPE_SELF_DEFINE** | true | Whether to enable ROPE fusion operator. | `true`: enable ROPE fusion operator;
`false`: disable ROPE fusion operator. | Enabling the ROPE fusion operator by default can improve the computation efficiency. Except for debugging scenarios, turn it off as needed, and generally do not make special settings. | -| **MS_ENABLE_INTERNAL_BOOST** | off | Whether to turn on the internal acceleration of the MindSpore framework. | `on`: turn on MindSpore internal acceleration;
`off`: turn off MindSpore internal acceleration. | In cases where debugging or comparing different acceleration strategies is performed, this parameter needs to be turned off to observe the impact on performance. In scenarios where efficient inference is required, it can be turned on for acceleration. | -| **MS_GE_ATOMIC_CLEAN_POLICY** | 1 | Whether to clean up the memory occupied by atomic operators in the network. | `0`: centralized cleanup of memory occupied by all atomic operators in the network;
`1`: no centralized memory cleanup, individual zeroing of each atomic operator in the network. | The switch is set to `1` by default, which makes it easy for the user to process each operator individually, allowing operations such as operator memory reuse. Setting it to `0` centrally cleans up the memory occupied by the operators. | -| **ENABLE_LAZY_INLINE** | 1 | Whether to enable lazy inline. | `0`: turn off lazy inline;
`1`: turn on lazy inline. | Available under mindspore ≥ 2.2.0. It is usually used during pipeline parallelism to improve compilation performance. It is enabled by default and can be configured to be disabled. | -| **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | Whether to enable lazy inline under non-pipeline parallel. | `0`: turn off lazy inline;
`1`: turn on lazy inline. |The lazy inline feature is only enabled in pipeline parallel mode by default. To enable lazy inline in other parallel modes, set this environment variable to 1. | -| **MS_ASCEND_CHECK_OVERFLOW_MODE** | INFNAN_MODE | Sets the overflow detection mode. | `SATURATION_MODE`: saturation mode, saturates to floating-point extremes (+-MAX) when the calculation overflows;
`INFNAN_MODE`: INF/NAN mode, follows the IEEE 754 standard, and outputs INF/NAN calculations as defined. | In large model tuning, the overflow state is aligned PyTorch and it is recommended to use INFNAN_MODE, i.e. export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE.
Try setting this variable to INFNAN_MODE when encountering persistent overflow problems. | -| **MF_LOG_SUFFIX** | NA | Set custom suffixes for all log log folders. | Suffix for the log folder. Default: no suffix | Adding a consistent suffix isolates logs across tasks from being overwritten. | -| **PLOG_REDIRECT_TO_OUTPUT** | False | Controls whether plog logs change storage paths. | `True`: store the logs in the ./output directory;
`False`: Store to the default storage location. | This setting makes it easier to query the plog log. | +| Variables Names | Default | Interpretations | Descriptions | Application Scenarios | +|------------------------------------|-------------|-----------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ---------------------------------------------------------------------------------------------------------- | +| **RUN_MODE** | predict | Set the running mode. | `predict`: inference
`finetune`: Fine-tuning
`train`: Training
`eval`: Evaluation | +| **USE_ROPE_SELF_DEFINE** | true | Whether to enable ROPE fusion operator. | `true`: enable ROPE fusion operator;
`false`: disable ROPE fusion operator. | Enabling the ROPE fusion operator by default can improve the computation efficiency. Except for debugging scenarios, turn it off as needed, and generally do not make special settings. | +| **MS_ENABLE_INTERNAL_BOOST** | off | Whether to turn on the internal acceleration of the MindSpore framework. | `on`: turn on MindSpore internal acceleration;
`off`: turn off MindSpore internal acceleration. | In cases where debugging or comparing different acceleration strategies is performed, this parameter needs to be turned off to observe the impact on performance. In scenarios where efficient inference is required, it can be turned on for acceleration. | +| **MS_GE_ATOMIC_CLEAN_POLICY** | 1 | Whether to clean up the memory occupied by atomic operators in the network. | `0`: centralized cleanup of memory occupied by all atomic operators in the network;
`1`: no centralized memory cleanup, individual zeroing of each atomic operator in the network. | The switch is set to `1` by default, which makes it easy for the user to process each operator individually, allowing operations such as operator memory reuse. Setting it to `0` centrally cleans up the memory occupied by the operators. | +| **ENABLE_LAZY_INLINE** | 1 | Whether to enable lazy inline. | `0`: turn off lazy inline;
`1`: turn on lazy inline. | Available under mindspore ≥ 2.2.0. It is usually used during pipeline parallelism to improve compilation performance. It is enabled by default and can be configured to be disabled. | +| **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | Whether to enable lazy inline under non-pipeline parallel. | `0`: turn off lazy inline;
`1`: turn on lazy inline. | The lazy inline feature is only enabled in pipeline parallel mode by default. To enable lazy inline in other parallel modes, set this environment variable to 1. | +| **MS_ASCEND_CHECK_OVERFLOW_MODE** | INFNAN_MODE | Sets the overflow detection mode. | `SATURATION_MODE`: saturation mode, saturates to floating-point extremes (+-MAX) when the calculation overflows;<br>
`INFNAN_MODE`: INF/NAN mode, follows the IEEE 754 standard, and outputs INF/NAN calculations as defined. | In large model tuning, the overflow state is aligned with PyTorch, and it is recommended to use INFNAN_MODE, i.e. export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE.<br>
Try setting this variable to INFNAN_MODE when encountering persistent overflow problems. | +| **MF_LOG_SUFFIX** | NA | Set custom suffixes for all log folders. | Suffix for the log folder. Default: no suffix | Adding a consistent suffix isolates logs across tasks from being overwritten. | +| **PLOG_REDIRECT_TO_OUTPUT** | False | Controls whether plog logs change storage paths. | `True`: store the logs in the ./output directory;<br>
`False`: Store to the default storage location. | This setting makes it easier to query the plog log. | +| **MS_ENABLE_FA_FLATTEN** | on | Controls whether to support FlashAttention flatten optimization. | `on`: Enable FlashAttention flatten optimization;<br>
`off`: Disable FlashAttention flatten optimization. | Provide a fallback mechanism for models that have not yet been adapted to FlashAttention flatten optimization. | diff --git a/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md b/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md index e3cf2b426b..71420d5cce 100644 --- a/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md +++ b/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md @@ -26,14 +26,15 @@ ## 其他变量 -| 变量名称 | 默认值 | 解释 | 说明 | 应用场景 | -| ---------------------------------- | ----------- | -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **RUN_MODE** | predict | 设置运行模式。 | `predict`:推理;
`finetune`:微调;
`train`:训练;
`eval`:评测。 | | -| **USE_ROPE_SELF_DEFINE** | true | 是否使用 ROPE 融合算子。 | `true`:使用 ROPE 融合算子;
`false`:不使用 ROPE 融合算子。 | 默认开启 ROPE 融合算子可以提升计算效率。除调试场景,根据需要进行关闭,一般不作特别设置。 | -| **MS_ENABLE_INTERNAL_BOOST** | off | 是否打开 MindSpore 框架的内部加速功能。 | `on`:开启 MindSpore 内部加速;
`off`:关闭 MindSpore 内部加速。 | 在进行调试或对比不同加速策略的情况下,需要关闭此参数以观察对性能的影响。在需要高效推理的场景,可以开启以进行加速。 | -| **MS_GE_ATOMIC_CLEAN_POLICY** | 1 | 是否集中清理网络中 atomic 算子占用的内存。 | `0`:集中清理网络中所有 atomic 算子占用的内存;
`1`:不集中清理内存,对网络中每一个 atomic 算子进行单独清零。 | 开关默认设置为`1`,方便用户对每个算子进行单独处理,可以进行算子内存复用等操作。设置为`0`后,集中清理算子所占内存。 | -| **ENABLE_LAZY_INLINE** | 1 | 是否开启 lazy inline。 | `0`:关闭 lazy inline;
`1`:开启 lazy inline。 | 此特性在 mindspore≥2.2.0 下适用。通常在 pipeline 并行时使用以提高编译性能。默认开启,可配置关闭。 | -| **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | 是否开启在非 pipeline 并行下的 lazy inline。 | `0`:关闭 lazy inline;
`1`:开启 lazy inline。 | lazy inline 特性默认仅在 pipeline 并行模式下开启。如需在其他并行模式下使能 lazy inline,可将该环境变量设置为 1。 | -| **MS_ASCEND_CHECK_OVERFLOW_MODE** | INFNAN_MODE | 设置浮点计算结果输出模式。 | `SATURATION_MODE`:饱和模式,计算出现溢出时,饱和为浮点数极值(+-MAX);
`INFNAN_MODE`:INF/NAN 模式,遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。 | 在大模型调优中,溢出状态对齐 PyTorch 方式,建议使用 INFNAN_MODE,即 export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE。
遇到持续溢出问题时可尝试设置此变量为 INFNAN_MODE。 | -| **MF_LOG_SUFFIX** | NA | 设置所有 log 日志文件夹的自定义后缀。 | log 文件夹的后缀。默认值:无后缀 | 添加一致的后缀,可以隔离各个任务的日志,不会被覆写。 | -| **PLOG_REDIRECT_TO_OUTPUT** | False | 控制 plog 日志是否改变存储路径。 | `True`:存储到./output 目录下;
`False`: 存储到默认存储位置。 | 设置之后方便用户查询 plog 日志。 | +| 变量名称 | 默认值 | 解释 | 说明 | 应用场景 | +| ---------------------------------- |-------------|------------------------------------|------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------| +| **RUN_MODE** | predict | 设置运行模式。 | `predict`:推理;
`finetune`:微调;
`train`:训练;
`eval`:评测。 | | +| **USE_ROPE_SELF_DEFINE** | true | 是否使用 ROPE 融合算子。 | `true`:使用 ROPE 融合算子;
`false`:不使用 ROPE 融合算子。 | 默认开启 ROPE 融合算子可以提升计算效率。除调试场景,根据需要进行关闭,一般不作特别设置。 | +| **MS_ENABLE_INTERNAL_BOOST** | off | 是否打开 MindSpore 框架的内部加速功能。 | `on`:开启 MindSpore 内部加速;
`off`:关闭 MindSpore 内部加速。 | 在进行调试或对比不同加速策略的情况下,需要关闭此参数以观察对性能的影响。在需要高效推理的场景,可以开启以进行加速。 | +| **MS_GE_ATOMIC_CLEAN_POLICY** | 1 | 是否集中清理网络中 atomic 算子占用的内存。 | `0`:集中清理网络中所有 atomic 算子占用的内存;
`1`:不集中清理内存,对网络中每一个 atomic 算子进行单独清零。 | 开关默认设置为`1`,方便用户对每个算子进行单独处理,可以进行算子内存复用等操作。设置为`0`后,集中清理算子所占内存。 | +| **ENABLE_LAZY_INLINE** | 1 | 是否开启 lazy inline。 | `0`:关闭 lazy inline;
`1`:开启 lazy inline。 | 此特性在 mindspore≥2.2.0 下适用。通常在 pipeline 并行时使用以提高编译性能。默认开启,可配置关闭。 | +| **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | 是否开启在非 pipeline 并行下的 lazy inline。 | `0`:关闭 lazy inline;
`1`:开启 lazy inline。 | lazy inline 特性默认仅在 pipeline 并行模式下开启。如需在其他并行模式下使能 lazy inline,可将该环境变量设置为 1。 | +| **MS_ASCEND_CHECK_OVERFLOW_MODE** | INFNAN_MODE | 设置浮点计算结果输出模式。 | `SATURATION_MODE`:饱和模式,计算出现溢出时,饱和为浮点数极值(+-MAX);
`INFNAN_MODE`:INF/NAN 模式,遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。 | 在大模型调优中,溢出状态对齐 PyTorch 方式,建议使用 INFNAN_MODE,即 export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE。
遇到持续溢出问题时可尝试设置此变量为 INFNAN_MODE。 | +| **MF_LOG_SUFFIX** | NA | 设置所有 log 日志文件夹的自定义后缀。 | log 文件夹的后缀。默认值:无后缀 | 添加一致的后缀,可以隔离各个任务的日志,不会被覆写。 | +| **PLOG_REDIRECT_TO_OUTPUT** | False | 控制 plog 日志是否改变存储路径。 | `True`:存储到./output 目录下;
`False`: 存储到默认存储位置。 | 设置之后方便用户查询 plog 日志。 | +| **MS_ENABLE_FA_FLATTEN** | on | 控制是否支持 FlashAttention flatten 优化。 | `on`:启用 FlashAttention flatten 优化;<br>
`off`: 禁用 FlashAttention flatten 优化。 | 对于还未适配FlashAttention flatten 优化的模型提供回退机制。 | -- Gitee From 76d5e122b1c5fb92e1ecc0348da1ae91d8a5d9cb Mon Sep 17 00:00:00 2001 From: yuhan Date: Thu, 17 Oct 2024 11:42:12 +0800 Subject: [PATCH 14/15] modify urls --- docs/mindspore/source_en/model_train/debug/error_analysis.rst | 2 +- .../model_train/debug/error_analysis/error_scenario_analysis.md | 2 +- docs/mindspore/source_en/model_train/debug/pynative.md | 2 +- .../source_en/model_train/parallel/operator_parallel.md | 2 +- .../mindspore/source_zh_cn/model_train/debug/error_analysis.rst | 2 +- .../model_train/debug/error_analysis/error_scenario_analysis.md | 2 +- .../source_zh_cn/model_train/parallel/operator_parallel.md | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/mindspore/source_en/model_train/debug/error_analysis.rst b/docs/mindspore/source_en/model_train/debug/error_analysis.rst index 9b9e4698ed..a9e5c4652c 100644 --- a/docs/mindspore/source_en/model_train/debug/error_analysis.rst +++ b/docs/mindspore/source_en/model_train/debug/error_analysis.rst @@ -250,7 +250,7 @@ MindSpore provides framework developers with rich debugging tools. Debugging fea +=========================+==========================+===============================================================+=======================================================+ | Process records | Logs | used to record information at each stage of the framework | `Log-related environment variables and configurations | | | | implementation to provide information for understanding | `_ | +| | | the framework implementation process or for problem diagnosis.| /api_python/env_var_list.html>`_ | | | | | | +-------------------------+--------------------------+---------------------------------------------------------------+-------------------------------------------------------+ | Data records | RDR | Running Data Recorder (RDR) provides the ability | `Running Data Recorder | diff --git a/docs/mindspore/source_en/model_train/debug/error_analysis/error_scenario_analysis.md b/docs/mindspore/source_en/model_train/debug/error_analysis/error_scenario_analysis.md index e2540a341e..824e0b1b22 100644 --- a/docs/mindspore/source_en/model_train/debug/error_analysis/error_scenario_analysis.md +++ b/docs/mindspore/source_en/model_train/debug/error_analysis/error_scenario_analysis.md @@ -114,7 +114,7 @@ class MyStridedSlice(nn.Cell): Error cause: -The piece of code performs the slice operation on dimension 0. However, the configured policy (2,1) indicates that the slice operation is performed on both dimension 0 and dimension 1 of the input tensor. According to the description of operator slicing in the [MindSpore API](https://www.mindspore.cn/docs/en/master/note/operator_list_parallel.html), +The piece of code performs the slice operation on dimension 0. However, the configured policy (2,1) indicates that the slice operation is performed on both dimension 0 and dimension 1 of the input tensor. According to the description of operator slicing in the [MindSpore API](https://www.mindspore.cn/docs/en/master/api_python/operator_list_parallel.html), > only the mask whose value is all 0s is supported. All dimensions that are sliced must be extracted together. The input dimensions whose strides is not set to 1 cannot be sliced. 
diff --git a/docs/mindspore/source_en/model_train/debug/pynative.md index 8733f460e6..6d5b2bfea3 100644 --- a/docs/mindspore/source_en/model_train/debug/pynative.md +++ b/docs/mindspore/source_en/model_train/debug/pynative.md @@ -82,7 +82,7 @@ In the debugging process, it is often necessary to view the log to locate the pr - 3-ERROR, indicates that there is an error in the execution of the program, the error log is output, and the program may not be terminated. - 4-CRITICAL, indicates that the program execution is abnormal and will be terminated. -See [environment variables](https://www.mindspore.cn/docs/en/master/note/env_var_list.html#log) for detailed logging controls. +See [environment variables](https://www.mindspore.cn/docs/en/master/api_python/env_var_list.html#log) for detailed logging controls. ### Common PDB Debugging Commands diff --git a/docs/mindspore/source_en/model_train/parallel/operator_parallel.md index 1d7da32d86..34584caf5a 100644 --- a/docs/mindspore/source_en/model_train/parallel/operator_parallel.md +++ b/docs/mindspore/source_en/model_train/parallel/operator_parallel.md @@ -4,7 +4,7 @@ ## Overview -With the development of deep learning, network models are becoming larger and larger, such as trillions of parametric models have emerged in the field of NLP, and the model capacity far exceeds the memory capacity of a single device, making it impossible to train on a single card or data parallel. Operator-level parallelism is achieved by slicing the tensor involved in each operator in the network model. Logical data parallelism is used when only the data dimension is sliced, while logical model parallelism is used when only the model dimension is silced. The training of large models is enabled by reducing the memory consumption of a single device. For a list of operators that currently support parallelism, see [Usage Constraints During Operator Parallel](https://www.mindspore.cn/docs/en/master/note/operator_list_parallel.html). +With the development of deep learning, network models are becoming larger and larger, such as trillions of parametric models have emerged in the field of NLP, and the model capacity far exceeds the memory capacity of a single device, making it impossible to train on a single card or data parallel. Operator-level parallelism is achieved by slicing the tensor involved in each operator in the network model. Logical data parallelism is used when only the data dimension is sliced, while logical model parallelism is used when only the model dimension is sliced. The training of large models is enabled by reducing the memory consumption of a single device. For a list of operators that currently support parallelism, see [Usage Constraints During Operator Parallel](https://www.mindspore.cn/docs/en/master/api_python/operator_list_parallel.html). > Hardware platforms supported by the operator-level parallel model include Ascend, GPU, and need to be run in Graph mode. <br>
diff --git a/docs/mindspore/source_zh_cn/model_train/debug/error_analysis.rst b/docs/mindspore/source_zh_cn/model_train/debug/error_analysis.rst index 6d9c61074e..949c4f72c7 100644 --- a/docs/mindspore/source_zh_cn/model_train/debug/error_analysis.rst +++ b/docs/mindspore/source_zh_cn/model_train/debug/error_analysis.rst @@ -275,7 +275,7 @@ MindSpore为框架开发者提供了丰富的调试手段,调试功能涵盖 | | | 的信息,为了解框架执行过程\ | `_ | | | | | | | | | | | diff --git a/docs/mindspore/source_zh_cn/model_train/debug/error_analysis/error_scenario_analysis.md b/docs/mindspore/source_zh_cn/model_train/debug/error_analysis/error_scenario_analysis.md index 4942b03470..84797647fc 100644 --- a/docs/mindspore/source_zh_cn/model_train/debug/error_analysis/error_scenario_analysis.md +++ b/docs/mindspore/source_zh_cn/model_train/debug/error_analysis/error_scenario_analysis.md @@ -114,7 +114,7 @@ class MyStridedSlice(nn.Cell): 错误原因: -这段代码在第零维度进行了取切片操作。但是配置的策略(2,1)表示分别对输入Tensor的第零维度和第一维度进行取切片操作。根据目前[MindSpore API文档](https://www.mindspore.cn/docs/zh-CN/master/note/operator_list_parallel.html)中对算子切分的说明, +这段代码在第零维度进行了取切片操作。但是配置的策略(2,1)表示分别对输入Tensor的第零维度和第一维度进行取切片操作。根据目前[MindSpore API文档](https://www.mindspore.cn/docs/zh-CN/master/api_python/operator_list_parallel.html)中对算子切分的说明, > 仅支持值为全0的mask;需要切分的维度必须全部提取;输入在strides不为1对应的维度不支持切分 diff --git a/docs/mindspore/source_zh_cn/model_train/parallel/operator_parallel.md b/docs/mindspore/source_zh_cn/model_train/parallel/operator_parallel.md index 878094040f..3840852bef 100644 --- a/docs/mindspore/source_zh_cn/model_train/parallel/operator_parallel.md +++ b/docs/mindspore/source_zh_cn/model_train/parallel/operator_parallel.md @@ -4,7 +4,7 @@ ## 概述 -随着深度学习的发展,网络模型正变得越来越大,如NLP领域已出现万亿级参数量的模型,模型容量远超单个设备的内存容量,导致单卡或数据并行均无法进行训练。算子级并行是通过将网络模型中每个算子涉及到的张量进行切分,当仅切分数据维度时,为逻辑上的数据并行,当仅切分模型维度时,为逻辑上的模型并行,通过降低单个设备的内存消耗,从而使大模型的训练成为可能。目前支持并行的算子列表,可以参考[算子级并行使用约束](https://www.mindspore.cn/docs/zh-CN/master/note/operator_list_parallel.html)。 +随着深度学习的发展,网络模型正变得越来越大,如NLP领域已出现万亿级参数量的模型,模型容量远超单个设备的内存容量,导致单卡或数据并行均无法进行训练。算子级并行是通过将网络模型中每个算子涉及到的张量进行切分,当仅切分数据维度时,为逻辑上的数据并行,当仅切分模型维度时,为逻辑上的模型并行,通过降低单个设备的内存消耗,从而使大模型的训练成为可能。目前支持并行的算子列表,可以参考[算子级并行使用约束](https://www.mindspore.cn/docs/zh-CN/master/api_python/operator_list_parallel.html)。 > 算子级并行模型支持的硬件平台包括Ascend、GPU,需要在Graph模式下运行。 -- Gitee From 0d17b4ce91fcdf4d880ee90f9b827daa33d00f5c Mon Sep 17 00:00:00 2001 From: Chenglin Jinag Date: Thu, 17 Oct 2024 15:51:02 +0800 Subject: [PATCH 15/15] Revert "add uce docs" This reverts commit 8c8c6ca70f2d2522d8bc406b0c362025cfb9c4b4. --- .../source_zh_cn/model_train/index.rst | 1 - .../train_availability/UCE_fault_recover.md | 147 ------------------ 2 files changed, 148 deletions(-) delete mode 100644 docs/mindspore/source_zh_cn/model_train/train_availability/UCE_fault_recover.md diff --git a/docs/mindspore/source_zh_cn/model_train/index.rst b/docs/mindspore/source_zh_cn/model_train/index.rst index 6bff319477..08c6337da1 100644 --- a/docs/mindspore/source_zh_cn/model_train/index.rst +++ b/docs/mindspore/source_zh_cn/model_train/index.rst @@ -91,7 +91,6 @@ train_availability/fault_recover train_availability/graceful_exit train_availability/mindio_ttp - train_availability/UCE_fault_recover train_availability/storage_sys .. 
toctree:: diff --git a/docs/mindspore/source_zh_cn/model_train/train_availability/UCE_fault_recover.md b/docs/mindspore/source_zh_cn/model_train/train_availability/UCE_fault_recover.md deleted file mode 100644 index b773e0a0ce..0000000000 --- a/docs/mindspore/source_zh_cn/model_train/train_availability/UCE_fault_recover.md +++ /dev/null @@ -1,147 +0,0 @@ -# UCE故障快速恢复 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_zh_cn/model_train/train_availability/UCE_fault_recover.md) - -## 概述 - -模型并行训练过程中,可能会遇到UCE(Uncorrectable Error)故障导致训练中断。重新启动训练,各种资源的开销是巨大的。为此MindSpore提供了故障恢复的方案。使得在发生故障时,模型在故障发生处快速恢复并继续训练,无需重启训练。 - -## 用例 - -下面以一个4卡数据并行网络训练为例,介绍如何配置UCE故障快速恢复。 配置完成后,在训练中如遇到UCE故障,MindSpore和MindIO会停止所有卡的训练, 对故障卡进行清洗和修复, 从故障卡的备份卡拷贝参数到故障卡并继续训练。如果故障发生在第n个step, 那继续训练将从第n+1个step开始。 - -### 环境准备 - -开启UCE快速恢复功能需要先安装`MindIO`, 详情参见[MindIO](https://www.hiascend.com/document/detail/zh/mindx-dl/60rc2/mindio/mindiottp/mindiottp001.html)。 - -### 准备数据 - -下载MNIST数据集,并解压数据集到项目目录。 - -```bash -wget http://mindspore-website.obs.cn-north-4.myhuaweicloud.com/notebook/datasets/MNIST_Data.zip -unzip MNIST_Data.zip -``` - -### 模型定义 - -开启UCE快速恢复功能需要设置TFT优化器, 在优化器更新前向MindIO TFT上报状态。用`OptTFTWrapper`来配置, 详情参见[OptTFTWrapper](https://www.mindspore.cn/docs/zh-CN/master/api_python/nn/mindspore.nn.OptTFTWrapper.html)。 - -```python - -import os -import math -import mindspore as ms -import mindspore.dataset as ds -from mindspore import nn, ops, Parameter, train -from mindspore.communication import init -from mindspore.common.initializer import initializer, HeUniform - - -ms.set_context(mode=ms.GRAPH_MODE, - jit_level='O1', - device_target="Ascend") - -ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL) -init() - -class MatMulCell(nn.Cell): - """ - MatMulCell definition. - """ - def __init__(self, param=None, shape=None): - super().__init__() - if shape is None: - shape = [28 * 28, 512] - weight_init = HeUniform(math.sqrt(5)) - self.param = Parameter(initializer(weight_init, shape), name="param") - if param is not None: - self.param = param - self.print = ops.Print() - self.matmul = ops.MatMul() - - def construct(self, x): - out = self.matmul(x, self.param) - self.print("out is:", out) - return out - - -class Network(nn.Cell): - """ - Network definition. 
- """ - def __init__(self): - super().__init__() - self.flatten = nn.Flatten() - self.layer1 = MatMulCell() - self.relu1 = nn.ReLU() - self.layer2 = nn.Dense(512, 512) - self.relu2 = nn.ReLU() - self.layer3 = nn.Dense(512, 10) - - def construct(self, x): - x = self.flatten(x) - x = self.layer1(x) - x = self.relu1(x) - x = self.layer2(x) - x = self.relu2(x) - logits = self.layer3(x) - return logits - -net = Network() - - -def create_dataset(batch_size): - """create dataset""" - dataset_path = os.getenv("DATA_PATH") - dataset = ds.MnistDataset(dataset_path) - image_transforms = [ - ds.vision.Rescale(1.0 / 255.0, 0), - ds.vision.Normalize(mean=(0.1307,), std=(0.3081,)), - ds.vision.HWC2CHW() - ] - label_transform = ds.transforms.TypeCast(ms.int32) - dataset = dataset.map(image_transforms, 'image') - dataset = dataset.map(label_transform, 'label') - dataset = dataset.batch(batch_size) - return dataset - -dataset = create_dataset(32) - -optimizer = nn.SGD(net.trainable_params(), 1e-2) -#配置TFT优化器 -optimizer_wrapper = nn.OptTFTWrapper(optimizer) -loss_fn = nn.CrossEntropyLoss() - -model = ms.Model(net, loss_fn=loss_fn, optimizer=optimizer_wrapper) -``` - -### Callback - -开启UCE快速恢复功能需要设置 `TFTRegister` Callback对象,并传入参数来配置,详情参见[TFTRegister](https://www.mindspore.cn/docs/zh-CN/master/api_python/train/mindspore.train.TFTRegister.html)。 - -```python -time_monitor = train.TimeMonitor(data_size=1) -loss_cb = train.LossMonitor(1) - -# 设置callback对象 -tft_cb = train.TFTRegister(0, "127.0.0.1", 30051, "./ttp_checkpoints/") - -model.train(5, dataset, callbacks=[time_monitor, loss_cb, tft_cb]) - -``` - -### 配置环境变量并启动训练 - -开启UCE故障快速恢复功能,需要设置环境变量 `MS_ENABLE_TFT='{UCE:1, TTP:1}'`。 其中 `UCE:1` 表示开启UCE快速恢复功能,`TTP:1` 表示开启临终遗言功能。 开启UCE会默认开启临终遗言功能, 如果想仅开启临终功能,可以设置环境变量 `MS_ENABLE_TFT='{UCE:0, TTP:1}'` 。此外还需要设置环境变量 `MINDIO_FOR_MINDSPORE=1`, 使能 `MindIO` 适配 MindSpore。 - -使用 `msrun` 命令启动训练。 - -```bash -export MS_ENABLE_TFT='{UCE:1 TTP:1}' -export MINDIO_FOR_MINDSPORE=1 -export DATA_PATH=${EXEC_PATH}/MNIST_DATA/train/ - -# UCE_case.py 按照上述代码创建 -msrun --worker_num=4 --local_worker_num=4 --master_port=10970 --join=False --log_dir=./uce_logs UCE_case.py -``` -- Gitee