diff --git a/huawei/ais-bench_workload/README.md b/huawei/ais-bench_workload/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b26cd20667d28a4b77260c412bc0303157b06b8e --- /dev/null +++ b/huawei/ais-bench_workload/README.md @@ -0,0 +1,47 @@ +# ais-bench-workload + +## 介绍 + +ais-bench-workload目录主要承载基于AISBench测试基准的模型负载代码以及为AISBench测试基准贡献的高易用性子工具,用于AI服务器的性能测试。 + +### AISBench场景介绍 + +AISBench标准化性能测试软件,又称AI Server Benchmark软件,是根据AI标准(IEEE 2937及 T/CESA 1169-2021)对AI服务器进行性能测试的工具软件。 + +AISBench软件包括如下2个测试场景: + +- 网络测试模式 - 适用于正式测试场景 + +```mermaid + +graph LR + subgraph stubs服务器-被测试者-厂商设备 + ais-bench-stubs -- 本地拉起 --> 负载代码 + end + subgraph tester服务器-测试者 + ais-bench-tester --网络交互通信 --> ais-bench-stubs + end + +``` + +- 本地离线测试模式 - 适用于本地裸机测试场景,不需要联网,不需要连接tester服务器 + +```mermaid + +graph LR + subgraph stubs服务器-被测试者-厂商设备 + ais-bench-stubs --本地拉起 --> 负载代码 + end + +``` + +### AISBench工具介绍 + +AISBench包括如下工具: + +| 工具名 | 工具及资料获取 | +| -------------------- | ------------------------------------------------------------ | +| AISBench测试基准工具 | 请从[人工智能系统性能基准工作组](https://www.aisbench.com/tool)获取。 | +| AISBench模型负载工具 | 训练负载:https://gitee.com/aisbench/training
推理负载:https://gitee.com/aisbench/inference
+| AISBench推理工具 | https://gitee.com/ascend/tools/tree/master/ais-bench_workload/tool/ais_bench | + diff --git a/huawei/ais-bench_workload/build/build.sh b/huawei/ais-bench_workload/build/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..96c28bdf923ceceb63f54e38ab7c10282f214c5e --- /dev/null +++ b/huawei/ais-bench_workload/build/build.sh @@ -0,0 +1,153 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) +ROOT_PATH=$(readlink -f $CURDIR/../) +OUTPUT_PATH="$ROOT_PATH/output/" + +export VERSION="1.0" + +check_command_exist() +{ + command=$1 + if type $command >/dev/null 2>&1;then + return 0 + else + return 1 + fi +} + +function check_env() +{ + check_command_exist git || { echo "git running failed" ; return 1; } + return $ret_ok +} + +function get_and_check_args() +{ + # check args num is valid + [ $# -ge 4 ] || { echo "args should >=3 not valid";return 1; } + + STUBS_PACKETS=$1 + STUBS_NAME=$(basename $STUBS_PACKETS) + STUBS_SUBNAME=${STUBS_NAME%.tar.gz} + + # according type get application base dir + basedir="$2" + manufactory_group=`ls $ROOT_PATH/src/$basedir` + echo $manufactory_group | grep -wq "$3" || { echo "$3 not invliad in [$manufactory_group] return";return 1; } + MANUFACTORY=$3 + + target_group=`ls $ROOT_PATH/src/$basedir/$MANUFACTORY` + echo $target_group | grep -wq "$4" || { echo "$4 not invliad in [$target_group] return";return 1; } + TARGETDIR=$4 + + shift + shift + shift + shift + scripts_args="$@" +} + +# copy doc files to packet +copy_doc_files() +{ + local branch_args="$1" + local run_type="$2" + + [ -d $OUTPUT_BASE_DIR/code/doc/ ] || mkdir -p $OUTPUT_BASE_DIR/code/doc/ + cp $ROOT_PATH/doc/*.md $OUTPUT_BASE_DIR/code/doc/ + + [[ "$PACKET_TYPE" == "inference" ]] && { cp $OUTPUT_BASE_DIR/code/doc/ais-bench_workload_inference*.md $OUTPUT_BASE_DIR/README.md;return; } + + # train modelarts mode + [[ "$PACKET_TYPE" == "train" && "$run_type" == "modelarts" ]] && { cp 
$OUTPUT_BASE_DIR/code/doc/ais-bench_workload_train_modelarts*.md $OUTPUT_BASE_DIR/README.md;return; } + # default as train offline mode + cp $OUTPUT_BASE_DIR/code/doc/ais-bench_workload_train_offline*.md $OUTPUT_BASE_DIR/README.md +} + +function build_packet() +{ + get_and_check_args "$@" || { echo "get check args failed ret:$ret";return $ret_error; } + + PACKET_TYPE="$2" + + BUILD_TMP_PATH=$CURDIR/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + # untar packfile + tar xvf $STUBS_PACKETS -C $BUILD_TMP_PATH || { echo "tar file failed ret";return $ret_error; } + + # exec build.sh and cp files + TARGET_PATH=$ROOT_PATH/src/$PACKET_TYPE/$MANUFACTORY/$TARGETDIR + if [ -f $TARGET_PATH/build.sh ];then + bash $TARGET_PATH/build.sh $scripts_args || { echo "warn build target failed"; return $ret_error; } + fi + + if [ ! -d $TARGET_PATH/output ];then + echo "targetdir:$TARGET_PATH not find output return" + return $ret_error + fi + + cd $BUILD_TMP_PATH + # get untar dir + OUTPUT_BASE_DIR=`find ./ -name "Ais-Benchmark-Stubs*" -type d` + if [[ ! -d "$OUTPUT_BASE_DIR" || ! -d "$OUTPUT_BASE_DIR/code" ]];then + echo "find no path:$OUTPUT_BASE_DIR return" + return $ret_error + fi + + cp $TARGET_PATH/output/* -rf $OUTPUT_BASE_DIR/code + chmod -R u+x $OUTPUT_BASE_DIR/code + + # for stubs old versions add adapter new ais_utils.py file + if [ ! 
-f $OUTPUT_BASE_DIR/code/ais_utils.py ] && [ -f $OUTPUT_BASE_DIR/code/libset_result.so ];then + cp ${ROOT_PATH}/src/ais_utils_adapter.py $OUTPUT_BASE_DIR/code/ais_utils.py + fi + + copy_doc_files $scripts_args + + #PLATFORM=`uname -i` + # OUTPUT_PACKET_NAME="$PACKET_TYPE"_"$MANUFACTORY"_"$TARGETDIR-Ais-Bench-$PLATFORM-${scripts_args// /_}" + OUTPUT_PACKET_NAME="$PACKET_TYPE"_"$MANUFACTORY"_"$TARGETDIR-$STUBS_SUBNAME-${scripts_args// /_}" + rm -rf $OUTPUT_PATH/$OUTPUT_PACKET_NAME.tar.gz + mv $OUTPUT_BASE_DIR $OUTPUT_PACKET_NAME + tar -czf $OUTPUT_PATH/$OUTPUT_PACKET_NAME.tar.gz $OUTPUT_PACKET_NAME + ret=$? + if [ $ret != 0 ];then + echo "tar out packet failed ret:$ret" + return $ret_error + fi + return $ret_ok +} + +function main() +{ + [[ $1 == *"tar.gz" && -f $1 ]] || { echo "args1:$1 not valid file" ; return 1; } + + if [ "$2" != "inference" -a "$2" != "train" ];then + echo "target not valid in:[$1] not match [train inference]" + return $ret_error + fi + + [ -d $OUTPUT_PATH ] || { mkdir -p $OUTPUT_PATH; } + + target=$2 + echo "target:$target now building" + check_env || { ret=$?;echo "check env failed ret:$ret";return $ret; } + + if [ "$target" == "inference" -o "$target" == "train" ];then + build_packet "$@" || { echo "build build_inference failed:$?";return 1; } + else + echo "target:$target return" + return 1 + fi + echo "target:$target now build done" + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/build/download_and_build.sh b/huawei/ais-bench_workload/build/download_and_build.sh new file mode 100644 index 0000000000000000000000000000000000000000..012e4c3042f88aed522b49862aec9cbcc5dedf25 --- /dev/null +++ b/huawei/ais-bench_workload/build/download_and_build.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +CURDIR=$(dirname $(readlink -f $0)) + +function check_command_exist() +{ + command=$1 + if type $command >/dev/null 2>&1; then + return 0 + else + return 1 + fi +} + +check_file_valid() +{ + if [ ! 
-f "$1" ]; then + return 1 + fi + return 0 +} + +function try_download_stubs_packet(){ + #mkdir -p $INFER_BASE_PATH/opensource/gflags/src/ + #cmd="wget $1 --no-check-certificate -O $2" + cmd="curl -k -o $2 $1" + timeout 60 $cmd #>/dev/null 2>&1 + ret=$? + if [ "$ret" == 0 -a -s "$2" ];then + echo "download cmd:$cmd targetfile:$2 OK" + else + echo "download targetfile by $cmd Failed please check network or manual download to target file" + return 1 + fi +} + +# unrar_files() +# { +# local target_file_=$1 +# local tmp_dir_=$2 +# check_command_exist start && { start winrar e $target_file_ $tmp_dir_;return 0; } +# check_command_exist unrar && { unrar e $target_file_ $tmp_dir_ ;return 0; } +# return 1; +# } + +main() +{ + local version="$1" + local type="$2" + [ "$2" != "modelarts" ] && type="" + + stubs_packet_x86_64_url="https://www.aisbench.com/assets/public/article_attachment/1/file/93fa30f8-ffac-4b99-9da6-78caa62d3df7.gz" + stubs_bak_x86_64_url="https://aisbenchtest.obs.cn-north-4.myhuaweicloud.com/stubs_package/Ais-Benchmark-Stubs-x86_64-1.0.tar.gz" # obs link + stubs_x86_64_pkg_name="Ais-Benchmark-Stubs-x86_64-1.0.tar.gz" + stubs_packet_aarch64_url="https://www.aisbench.com/assets/public/article_attachment/1/file/0796bd4e-136d-45fe-b863-24661679a283.gz" + stubs_bak_aarch64_url="https://aisbenchtest.obs.cn-north-4.myhuaweicloud.com/stubs_package/Ais-Benchmark-Stubs-aarch64-1.0.tar.gz" # obs link + stubs_aarch64_pkg_name="Ais-Benchmark-Stubs-aarch64-1.0.tar.gz" + + tmp_dir="$CURDIR/buildtmpstubs" + [ -d $tmp_dir ] && rm -rf $tmp_dir + mkdir -p $tmp_dir + + check_command_exist git || { echo "git cmd not valid"; return -1; } + check_command_exist tar || { echo "tar cmd not valid"; return -1; } + + target_file="$tmp_dir/${stubs_x86_64_pkg_name}" + try_download_stubs_packet $stubs_packet_x86_64_url $target_file || { + echo "download x86_64 stubs failed, using bak_url ${stubs_bak_x86_64_url}"; + try_download_stubs_packet $stubs_bak_x86_64_url $target_file || { echo 
"download x86_64 stubs failed again!";return 1; } + } + target_file="$tmp_dir/${stubs_aarch64_pkg_name}" + try_download_stubs_packet $stubs_packet_aarch64_url $target_file || { + echo "download aarch64 stubs failed, using bak_url ${stubs_bak_aarch64_url}"; + try_download_stubs_packet $stubs_bak_aarch64_url $target_file || { echo "download aarch64 stubs failed again!";return 1; } + } + + sleep 5 + x86_stubs="$tmp_dir/Ais-Benchmark-Stubs-x86_64-1.0.tar.gz" + check_file_valid "$x86_stubs" || { echo "x86_stubs:${x86_stubs} not valid path" ; return 1; } + + arm_stubs="$tmp_dir/Ais-Benchmark-Stubs-aarch64-1.0.tar.gz" + check_file_valid "$arm_stubs" || { echo "arm_stubs:${arm_stubs} not valid path" ; return 1; } + + bash -x $CURDIR/build.sh $x86_stubs train huawei train_mindspore_resnet $version $type + bash -x $CURDIR/build.sh $x86_stubs train huawei train_mindspore_bert $version $type + bash -x $CURDIR/build.sh $arm_stubs train huawei train_mindspore_resnet $version $type + bash -x $CURDIR/build.sh $arm_stubs train huawei train_mindspore_bert $version $type +} + +main "$@" +exit $? 
diff --git "a/huawei/ais-bench_workload/doc/ais-bench_workload_train_modelarts\350\256\255\347\273\203\350\257\264\346\230\216\346\226\207\346\241\243.md" "b/huawei/ais-bench_workload/doc/ais-bench_workload_train_modelarts\350\256\255\347\273\203\350\257\264\346\230\216\346\226\207\346\241\243.md" new file mode 100644 index 0000000000000000000000000000000000000000..b0ca1d9ee8eaba0e0eef556c14fc9609a36354c9 --- /dev/null +++ "b/huawei/ais-bench_workload/doc/ais-bench_workload_train_modelarts\350\256\255\347\273\203\350\257\264\346\230\216\346\226\207\346\241\243.md" @@ -0,0 +1,429 @@ +# ais-bench_workload_train_modelarts训练说明文档 + + + +[TOC] + +## 简介 +ais-bench标准化性能测试软件,又称AI Server Benchmark软件,是根据AI标准(IEEE 2937及 T/CESA 1169-2021)对AI服务器进行性能测试的工具软件。 + +本文主要介绍基于ais-bench软件,在ModelArts平台(线上环境)对模型进行训练性能测试。主要实现集群训练业务性能测试场景。 + +[Modelarts](https://support.huaweicloud.com/productdesc-modelarts/modelarts_01_0001.html)是面向AI开发者的一站式开发平台,提供海量数据预处理及半自动化标注、大规模分布式训练、自动化模型生成及端-边-云模型按需部署能力,帮助用户快速创建和部署模型,管理全周期AI工作流。 + +## 流程介绍 +modelarts业务启动有如下三种方式运行: + +```mermaid +graph LR +UI页面 --> Modelarts服务 +modelarts-sdk --> Modelarts服务 +modelarts-api --> Modelarts服务 +``` + +ModelArts线上训练性能测试选择modelarts-sdk作为启动方式,通过ais-bench-stubs拉起modelarts-sdk向ModelArts平台下发训练作业指令。 + + +测试操作总体流程: +如下测试流程,本测试需要一台本地运行设备,用于给modelarts服务下发训练作业。 +```mermaid +graph LR + subgraph 本地运行设备 + ais-bench-stubs --本地拉起 --> modelarts-sdk + end + subgraph Modelarts服务 + modelarts-sdk --网络启动训练作业 --> 训练作业 + end +``` + +测试过程数据传输原理图: + +```sequence +本地运行设备->>OBS存储: 上传运行代码 +本地运行设备->>ModelArts侧: 传递训练参数,拉起训练 +ModelArts侧->>OBS存储: 请求下载训练代码 +OBS存储->>ModelArts侧: 下载代码 +ModelArts侧->>OBS存储: 请求下载数据集 +OBS存储->>ModelArts侧: 下载数据集 +ModelArts侧->>ModelArts侧: 执行训练 +ModelArts侧->>OBS存储: 上传throughput/accuracy数据 +ModelArts侧->>本地运行设备: 训练完成 +本地运行设备->>OBS存储: 请求下载throughput/accuracy数据 +OBS存储->>本地运行设备: 下载数据 +``` + +测试操作总体步骤: + +1. 准备本地运行设备环境。 +2. 填写配置信息。 +3. 
本地运行设备启动ais-bench-stubs程序,执行完成后获取性能数据结果。 + +## 使用前准备 + +### 环境 + +#### 本地运行设备 + +- 安装Linux系统。 +- 处于稳定的联网状态且能够与云上计算节点联网。 + +建议选择以下三种作为本地运行设备: + +- ECS云主机,可以咨询计算中心运维同事搭建启动ECS云主机。 +- Modelarts的notebook开发环境。 请参考《modelarts_notebook使用入门指导》。 +- windows上开启WSL linux子系统。请参考[官方链接。](https://docs.microsoft.com/zh-cn/windows/wsl/install) + +#### 软件依赖 + +- 安装Python3 + +- 安装easydict python程序包。 + + 请使用如下命令进行安装(当前以pip3为例,请选择与Python版本对应的的pip命令): + + ``` + pip3 install easydict + ``` + +- 安装modelarts-sdk程序包。请根据[modelarts-sdk官网教程](https://support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0004.html)下载对应版本并执行安装。 +- 当前适配ModelArts版本 >= 21.9.0 +### 数据集 + +下载相关模型数据集并上传至OBS存储中。 + +注意:数据集的数据量较大,需要在执行测试前下载并上传至OBS,OBS上传操作请查看具体OBS操作指导。 + +以resnet模型的imagenet数据集,bert模型的enwiki数据集为例。具体下载方式请至相关模型官网,本文不作详述。 + +由于当前OBS限定训练作业的数据输入为一个目录(示例名称:obs_url),故imagenet或enwiki数据集上传至OBS存储时的目录结构如下: + +imagenet + +```mermaid +graph LR +obs_url --> train目录 --> 训练数据集 +obs_url --> val目录 --> eval数据集 +``` + +enviki + +```mermaid +graph LR +obs_url --> train目录 --> 训练数据集 +obs_url --> val目录 --> eval数据集 +obs_url --> ms_bert_large.ckpt预训练文件 +``` + + + +### 软件包 + +请参见 《ais-bench_workload构建教程》,完成需要测试的模型对应的性能测试软件包构建。 + +#### 选择软件包 + +注意性能测试软件包会包含不同系统架构,请根据运行设备的系统架构进行选择。 + +- 比如运行设备的系统架构为x86_64架构,那么请选择xxxx_x86_64_xxx_modelarts.tar.gz软件包。 +- 比如运行设备的系统架构为aarch64架构,那么请选择xxxx_aarch64_xxx_modelarts.tar.gz软件包。 + +本文以mindspore框架r1.3版本的resnet模型运行设备aarch64环境进行举例,选择train_huawei_train_mindspore_resnet-Ais-Benchmark-Stubs-aarch64-1.0-r1.3_modelarts.tar.gz软件包。 + +#### 解压软件包 + +登录本地运行设备,将性能测试软件包拷贝到任意目录下,执行解压操作。 + +``` +tar xvf train_huawei_train_mindspore_resnet-Ais-Benchmark-Stubs-aarch64-1.0-r1.3_modelarts.tar.gz +``` + +软件包解压后目录结构如下: + +``` +. 
+├── ais-bench-stubs // 性能测试命令行工具,用于执行性能测试操作 +├── code // 性能测试代码包 +│ ├── config // 配置目录 +│ │ ├── config.sh // 离线训练性能测试配置文件 +│ │ ├── mindspore_env.sh // Mindspore框架模型测试时的环境变量,可根据实际需求补充环境变量 +│ │ ├── modelarts_config.py // 线上训练性能测试时配置 +│ │ └── tensorflow_env.sh // TensorFlow框架模型测试时的环境变量,可根据实际需求补充环境变量 +│ ├── config.json // tester服务器信息配置文件,配置后可自动将测试结果上报到tester服务器上。本地离线测试模式下不需要填写 +│ ├── doc // 指导文档存放目录 +│ └── system.json // 性能测试系统信息配置文件,仅当需要将测试结果上报到tester服务器时需要配置。本地离线测试模式下不需要填写 +├── log // 日志输出目录 +├── README.md // 测试指导文档 +├── result // 测试结果输出目录 +└── tmp // 数据缓存 +``` + + + +## 线上训练性能测试操作 + +### 文件配置 + +#### 配置modelarts_config.py + +modelarts_config.py是modelarts运行配置文件,位于性能测试软件包解压路径/code/config/modelarts_config.py,主要包括modelarts的鉴权和训练参数信息。 + +比较重要和必须要设置的参数如下: + +**access_config配置:必须填写,包含modelarts认证信息,请通过计算中心或者云服务的运维同事获取并确定。** + +**session_config配置:必须填写,包含训练作业参数信息。其中session_config是modelarts V1版本的配置,session_config_v2是modelarts V2版本的配置。** + +请根据配置文件的注释来编辑配置。 + +注意: + +1. 如果access_config.iam_endpoint access_config.obs_endpoint access_config.modelarts_endpoint三个参数需要填写,必须要设置对应的域名解析地址,该地址请咨询运维同事获取。 + + 如果本地运行设备是在ECS和Notebook中,且与modelarts服务同一网络,那么可以保证连通性,不需要设置。 + + 华为云服务不需要设置。只有计算中心才需要设置。 + +2. 当前选择的容器镜像版本是默认modelarts自带的,如果需要更新为指定的mindspore和cann版本。请参考“附录>CANN包和MindSpore软件更新”。 + +3. 训练运行参数v1版本的session_config.hyperparameters和V2版本的session_config_v2.parameters,请参考对应的模型训练启动文件的运行参数。 + +4. 注意节点配置不能跨资源池。要么使用专属资源池,要么使用公共资源池,不能一起使用。 + +5. 注意如果是多个节点,只能选择8卡设置。如果是非8卡,比如2节点1卡、2节点2卡、2节点4卡,当前modelarts不支持 + +6. 
modelarts配置项的详细配置方法,请参照配置文件中的注释说明 + +#### 配置config.sh + +config.sh通用负载配置文件,位于性能测试软件包解压路径/code/config/config.sh,主要包括离线训练性能测试操作的基本配置信息。 + +编辑文件: + +``` +# 必选,需设置为与当前环境匹配的Python版本 +export PYTHON_COMMAND=python3.7 +# 单服务器模式,取值为True/False,配置后各训练节点测试结果单独反馈,关闭时测试结果为各设备汇总性能结果。可选,默认关闭。 +export SINGLESERVER_MODE=True +``` +**modelarts训练版本配置** +本配置文件中增加如下配置,可将当前Modelarts运行版本配置为V2: +```bash +#modelarts version default "V1", Optional value ["V1", "V2"] +export MODELARTS_VERSION=V2 +``` +该环境变量默认是V1版本,不设置。需要执行modelarts V2版本时请显示声明该变量为"V2" + + +#### 配置config.json + +config.json tester服务器信息配置文件,位于性能测试软件包解压路径/code/config.json,主要填写ais-bench测试的tester服务器具体信息,用于在完成性能测试后将测试结果上报到tester服务器上。若无须上报测试结果,可不配置。 + +#### 配置训练配置yaml文件 +对于训练模型训练参数由yaml配置文件,用户有自定义的训练参数修改需求时,用户可以修改相关的模型训练yaml文件。 +比如B版芯片(Ascend910B)上,resnet50模型需要修改batch_size参数为240,用户可以直接修改: ++ 对于mindspore框架1.3版本,修改resnet50_imagenet2012_Acc_config.yaml ++ 对于mindspore框架1.3以上版本,修改resnet50_imagenet2012_Boost_config.yaml + +### 运行测试 + +完成配置文件配置后执行性能测试操作,本地离线测试模式(aisbench测试模式概念请参考[说明](../README.md))命令如下: + +``` +./ais-bench-stubs test +``` + +### 中断和停止训练 + ++ 云环境ModleArts界面操作。 + 在云环境ModleArts服务“训练管理 > 训练作业”界面,单击正在运行的job链接并进入。在执行job界面,单击“更多操作”按钮,激活下拉菜单,在上下文菜单中单击“停止”,即可终止运行的job。 + ++ 本地运行设备停止方法,操作如下: + + 对于modelarts V1版本: + +```bash +[root@node66 ]# ls +ais-bench-stubs code log result +[root@node66 code]# python3 ./code/common/train_modelarts.py --action stop +jobname:aisbench-debug jobid:3043 preversionid:13231 jobstatus:JOBSTAT_RUNNING stop status:{'is_success': True} +``` +该操作可以停止配置文件中job_name指示的最新一个作业版本。 +​ 对于modelarts V2版本: + +创建job成功后,本地运行设备屏幕会打印job相关信息,请搜索类似“create job sucess. 
job_id:c8e62b62-9529-4696-ba08-2969f4861a5d”,取"job_id:"后面部分,就是Job_id。 + +```bash +[root@node66 ]# python3 ./code/common/train_modelarts.py --action stop --modelarts_version V2 --job_id e7052953-3107-47d5-a5fa-725f9eced6e3 +stop jobid:e7052953-3107-47d5-a5fa-725f9eced6e3 sesion: +INFO:root:Successfully stop the job e7052953-3107-47d5-a5fa-725f9eced6e3 +job stop status: Terminated +``` + + + +### 结果呈现和展示 + +- 以2个节点(modelarts_config.py配置文件中train_instance_count参数配置为2)的bert r1.3 modelarts训练结果为例展示训练结果: + +```bash +report >> throughput_list:[450.77798444604605, 450.38567065252664] average:450.58182754928634 +report >> accuracy_list:[0.7138142585754395, 0.7139078378677368] average:0.7138610482215881 +2022-07-13T13:24:43 -Ais-Bench-Stubs- INFO run_eval(modelarts_run.sh:32) - run_eval called +2022-07-13T13:24:43 -Ais-Bench-Stubs- INFO get_result(modelarts_run.sh:37) - get_result called +[2022-7-13 11:27:19][INFO]get ConfigInfo testid:20210126-ljp0IY, Mode:training, Model:resnet50_v1.5, Divsion:close, Scenario:generic, test_object_type:single, tester_server_ip:127.0.0.1, tester_server_port:9527 +[2022-7-13 11:27:19][INFO]ais bench stubs begin run +[2022-7-13 11:27:19][INFO]workpath:/home/lhb/test6/train_huawei_train_mindspore_bert-Ais-Benchmark-Stubs-aarch64-1.0-r1.3_modelarts-single-0711 go testcase. +[2022-7-13 11:27:19][INFO]Benchmanager::Init() enter +[2022-7-13 11:27:19][INFO]Transmit_server start listen 0.0.0.0 : 9990 +[2022-7-13 11:27:19][INFO]get ConfigInfo testid:20210126-ljp0IY, Mode:training, Model:resnet50_v1.5, Divsion:close, Scenario:generic, test_object_type:single, tester_server_ip:127.0.0.1, tester_server_port:9527 +[2022-7-13 11:27:19][INFO]ais bench stubs begin run +[2022-7-13 11:27:19][INFO]workpath:/home/lhb/test6/train_huawei_train_mindspore_bert-Ais-Benchmark-Stubs-aarch64-1.0-r1.3_modelarts-single-0711 go testcase. 
+[2022-7-13 11:27:19][INFO]Benchmanager::Init() enter +[2022-7-13 11:27:19][INFO]Transmit_server start listen 0.0.0.0 : 9990 +[2022-7-13 13:24:48][INFO]train_result_info: { + "accuracy" : "0.7138610482215881", + "average_power" : 0, + "dataload_end_time" : "2020-01-30 14:16:00", + "dataload_start_time" : "2020-01-30 14:16:00", + "efficientcy" : 0, + "energy_consumption" : 0, + "max_power" : 0, + "prepare_end_time" : "2020-01-30 14:16:00", + "prepare_start_time" : "2020-01-30 14:16:00", + "proc_end_time" : "2020-01-30 14:16:00", + "proc_start_time" : "2020-01-30 14:16:00", + "resource_util_ratio" : 0, + "throughput_ratio" : "450.58182754928634", + "total_end_time" : "2022-07-13 13:24:43", + "total_start_time" : "2022-07-13 11:27:19" +} + +[2022-7-13 13:24:48][INFO]Transmit_server resource is released! +[2022-7-13 13:24:51][INFO]BenchManager stop done +``` + +- 2个节点的resnet r1.3 modelarts训练结果为例展示训练结果: + +```bash +report >> throughput_list:[14147.314993295107, 14155.048461692913] average:14151.181727494011 +report >> accuracy_list:[0.7705078125, 0.7707316080729166] average:0.7706197102864583 +2022-07-12T15:29:13 -Ais-Bench-Stubs- INFO run_eval(modelarts_run.sh:32) - run_eval called +2022-07-12T15:29:13 -Ais-Bench-Stubs- INFO get_result(modelarts_run.sh:37) - get_result called +[2022-7-12 12:19:43][INFO]get ConfigInfo testid:20210126-ljp0IY, Mode:training, Model:resnet50_v1.5, Divsion:close, Scenario:generic, test_object_type:single, tester_server_ip:127.0.0.1, tester_server_port:9527 +[2022-7-12 12:19:43][INFO]ais bench stubs begin run +[2022-7-12 12:19:43][INFO]workpath:/home/lhb/test6/train_huawei_train_mindspore_resnet-Ais-Benchmark-Stubs-aarch64-1.0-r1.3_modelarts-single-0712 go testcase. 
+[2022-7-12 12:19:43][INFO]Benchmanager::Init() enter +[2022-7-12 12:19:43][INFO]Transmit_server start listen 0.0.0.0 : 9990 +[2022-7-12 12:19:43][INFO]get ConfigInfo testid:20210126-ljp0IY, Mode:training, Model:resnet50_v1.5, Divsion:close, Scenario:generic, test_object_type:single, tester_server_ip:127.0.0.1, tester_server_port:9527 +[2022-7-12 12:19:43][INFO]ais bench stubs begin run +[2022-7-12 12:19:43][INFO]workpath:/home/lhb/test6/train_huawei_train_mindspore_resnet-Ais-Benchmark-Stubs-aarch64-1.0-r1.3_modelarts-single-0712 go testcase. +[2022-7-12 12:19:43][INFO]Benchmanager::Init() enter +[2022-7-12 12:19:43][INFO]Transmit_server start listen 0.0.0.0 : 9990 +[2022-7-12 15:29:18][INFO]train_result_info: { + "accuracy" : "0.7706197102864583", + "average_power" : 0, + "dataload_end_time" : "2020-01-30 14:16:00", + "dataload_start_time" : "2020-01-30 14:16:00", + "efficientcy" : 0, + "energy_consumption" : 0, + "max_power" : 0, + "prepare_end_time" : "2020-01-30 14:16:00", + "prepare_start_time" : "2020-01-30 14:16:00", + "proc_end_time" : "2020-01-30 14:16:00", + "proc_start_time" : "2020-01-30 14:16:00", + "resource_util_ratio" : 0, + "throughput_ratio" : "14151.181727494011", + "total_end_time" : "2022-07-12 15:29:13", + "total_start_time" : "2022-07-12 12:19:43" +} + +[2022-7-12 15:29:18][INFO]Transmit_server resource is released! +[2022-7-12 15:29:21][INFO]BenchManager stop done +``` + +### 训练作业日志说明 + +ModleArts训练作业日志可以通过以下方式查看: + +- ModleArts界面 +- OBS输出日志路径 +- ais-bench-stubs测试运行过程中,性能测试软件包解压路径/log目录中定时更新训练作业的日志信息文件 + +## 附录 + +### CANN包和MindSpore软件更新 + +如果当前测试需要更新CANN包,执行如下操作: + +1. 在性能测试软件包解压路径创建run目录,若已存在run目录,则删除run目录下的文件。 + + 示例文件如下: + + ``` + [root@node66 run]# tree -L 1 + . + ├── Ascend-cann-nnae_5.0.2.1_linux-aarch64.run + ├── mindspore_ascend-1.3.0-cp37-cp37m-linux_aarch64.whl + └── protobuf-3.20.1-cp37-cp37m-linux_aarch64.whl + ``` + + + +2. 
在训练主程序.py文件同级目录(性能测试软件包解压路径/code/code)增加ma-pre-start.sh脚本。 + + ma-pre-start.sh文件内容如下: + + ``` + #!/bin/bash + set -x + echo "Start to intall the run package" + LOCAL_DIR=$(cd "$(dirname "$0")";pwd) + echo $LOCAL_DIR + + TRAIN_PY_PATH=$(readlink -f `find ./ -name train.py`) + BASE_PATH=`dirname $TRAIN_PY_PATH` + + pip install $BASE_PATH/run/protobuf*.whl + pip install $BASE_PATH/run/mindspore_ascend*.whl + echo "replace origin mindspore packet!!! done ret:$? !!!" + + sudo chmod +x $BASE_PATH/run/*.run + CANN_RUN_PACKET=`find $BASE_PATH/run/ -name Ascend-cann-nnae*.run` + sudo $CANN_RUN_PACKET --upgrade + echo "replace origin CANN_RUN_PACKET!!!: $CANN_RUN_PACKET done ret:$? !!!" + + # env set + export GLOG_v=3 + export ASCEND_GLOBAL_LOG_LEVEL=3 + export ASCEND_GLOBAL_EVENT_ENABLE=0 + export ASCEND_SLOG_PRINT_TO_STDOUT=0 + + set +x + ``` + + 其中 CANN_RUN_PACKET参数-name请根据实际需要安装的CANN包名称设置为:Ascend-cann-nnae、Ascend-cann-nnrt、Ascend-cann-toolkit。 + +### 日志级别设置 + +通过修改ma-pre-start.sh文件中“GLOG_v”或“ASCEND_GLOBAL_LOG_LEVEL”的变量值,可以更新日志的级别。 + ++ GLOG日志级别取值为:0(INFO)、1(WARNING)、2(ERROR)、3(FATAL) ++ ASCEND_GLOBAL_LOG_LEVEL日志级别取值为:0(DEBUG)、1(INFO)、2(WARNING)、3(ERROR)、4(NULL) + +### 域名解析地址增加 + +请咨询ModelArts所在云环境的运维,获取该云相关服务(obs、modelarts、swr)域名和IP的映射关系并写入/etc/hosts, + +比如武汉云相关服务obs、modelarts、swr域名映射关系如下: + +```bash +58.48.42.196 obs.cn-central-221.ovaijisuan.com +58.48.42.193 modelarts.cn-central-221.ovaijisuan.com +58.48.42.198 swr.cn-central-221.ovaijisuan.com +``` + +注意: + +- 如果在notebook中运行,无须设置该项。 +- 华为云无须设置。 + diff --git "a/huawei/ais-bench_workload/doc/ais-bench_workload_train_offline\347\272\277\344\270\213\350\256\255\347\273\203\350\257\264\346\230\216\346\226\207\346\241\243.md" "b/huawei/ais-bench_workload/doc/ais-bench_workload_train_offline\347\272\277\344\270\213\350\256\255\347\273\203\350\257\264\346\230\216\346\226\207\346\241\243.md" new file mode 100644 index 0000000000000000000000000000000000000000..c1dba070a3ed476a22f8ffc8691b04f9744079a2 --- /dev/null +++ 
"b/huawei/ais-bench_workload/doc/ais-bench_workload_train_offline\347\272\277\344\270\213\350\256\255\347\273\203\350\257\264\346\230\216\346\226\207\346\241\243.md" @@ -0,0 +1,225 @@ +# ais-bench_workload_train_offline线下训练说明文档 + + + +[TOC] + +## 简介 + +ais-bench标准化性能测试软件,又称AI Server Benchmark软件,是根据AI标准(IEEE 2937及 T/CESA 1169-2021)对AI服务器进行性能测试的工具软件。 + +本文主要介绍基于ais-bench软件,在离线环境对模型进行训练性能测试。离线环境是指非modelarts云上训练场景,当前主要适配单卡、单机、线下集群、容器集群场景。 + +## 使用前准备 + +### 环境 + +1. Atals训练设备(搭载Ascend NPU以及Ascend 910芯片等昇腾硬件环境),可以搭建单卡、单机、线下集群、容器集群场景,相关硬件产品文档请参见[昇腾硬件产品文档](https://www.hiascend.com/document?data=hardware)。 +2. 根据需要测试的模型类型安装MindSpore或TensorFlow框架;参见《[CANN 软件安装指南](https://www.hiascend.com/document/detail/zh/canncommercial/51RC1/envdeployment/instg/instg_000002.html)》安装CANN软件包。MindSpore或TensorFlow框架需要根据《ais-bench_workload构建教程》所选择的模型版本来安装对应版本的框架。 +3. 集群测试时需要安装依赖软件--sshpass,版本无要求。 +4. 容器环境测试时,容器制作请参照《制作可ssh登录镜像ascend-mindspore-arm的方法》 + +### 数据集 + +下载相关模型数据集到运行设备任意目录下。例如resnet模型需要imagenet数据集,bert模型需要enwiki数据集。具体下载方式请至相关模型官网,本文不作详述。 + +### 软件包 + +请参见 《ais-bench_workload构建教程》,完成需要测试的模型对应的性能测试软件包构建。 + +#### 选择软件包 + +注意性能测试软件包会包含不同系统架构,请根据运行设备的系统架构进行选择。 + +- 比如运行设备的系统架构为x86_64架构,那么请选择xxxx_x86_64.tar.gz软件包。 +- 比如运行设备的系统架构为aarch64架构,那么请选择xxxx_aarch64_xxx.tar.gz软件包。 + +本文以mindspore框架r1.3版本的resnet模型运行设备aarch64环境进行举例,选择train_huawei_train_mindspore_resnet-Ais-Benchmark-Stubs-aarch64-1.0-r1.3.tar.gz软件包。 + +#### 解压软件包 + +登录运行设备,将性能测试软件包拷贝到任意目录下,执行解压操作。 + +``` +tar xvf train_huawei_train_mindspore_resnet-Ais-Benchmark-Stubs-aarch64-1.0-r1.3_modelarts.tar.gz +``` + +软件包解压后目录结构如下: + +``` +. 
+├── ais-bench-stubs // 性能测试命令行工具,用于执行性能测试操作 +├── code // 性能测试代码包 +│ ├── config // 配置目录 +│ │ ├── config.sh // 离线训练性能测试配置文件 +│ │ ├── mindspore_env.sh // Mindspore框架模型测试时的环境变量,可根据实际需求补充环境变量 +│ │ ├── modelarts_config.py // 线上训练性能测试时配置 +│ │ └── tensorflow_env.sh // TensorFlow框架模型测试时的环境变量,可根据实际需求补充环境变量 +│   ├── config.json // tester服务器信息配置文件,配置后可自动将测试结果上报到tester服务器上。本地离线测试模式下不需要填写 +│   ├── doc // 指导文档存放目录 +│   └── system.json // 性能测试系统信息配置文件,仅当需要将测试结果上报到tester服务器时需要配置。本地离线测试模式下不需要填写 +├── log // 日志输出目录 +├── README.md // 离线性能测试指导 +└── result // 测试结果输出目录 +``` + + + +## 离线训练性能测试操作 + +### 文件配置 + +#### 配置config.sh + +config.sh通用负载配置文件,位于性能测试软件包解压路径/code/config/config.sh,主要包括离线训练性能测试操作的基本配置信息。 + +请在配置文件中根据注释说明填写. + + +**注意** + +1. 非单卡环境下,必须要生成rank_table文件并配置RANK_TABLE_FILE变量。rank_table文件生成请参见“rank_table文件生成与实例”。 +2. 集群环境下,必须要生成节点ssh信息文件并配置NODEINFO_FILE变量。节点ssh信息文件生成请参见“节点ssh信息文件”。 +3. 集群环境下,直接使用ssh信息文件进行节点间的登录交互可能存在安全风险,可以设置集群节点的秘钥认证,提高安全性。请参见“集群节点免密设置”。 + +#### 配置config.json + +config.json tester服务器信息配置文件,位于性能测试软件包解压路径/code/config.json,主要填写ais-bench测试的tester服务器具体信息,用于在完成性能测试后将测试结果上报到tester服务器上。若无须上报测试结果,可不配置。 + +#### 配置system.json + +system.json 性能测试系统信息配置文件,位于性能测试软件包解压路径/code/system.json,主要填写ais-bench测试的运行环境系统信息,用于在完成性能测试后将运行环境系统信息作为测试结果的内容上报到tester服务器上。若无须上报测试结果,可不配置。 + +#### 配置训练配置yaml文件 +对于训练模型训练参数由yaml配置文件,用户有自定义的训练参数修改需求时,用户可以修改相关的模型训练yaml文件。 +比如B版芯片(Ascend910B)上,resnet50模型需要修改batch_size参数为240,用户可以直接修改: ++ 对于mindspore框架1.3版本,修改resnet50_imagenet2012_Acc_config.yaml ++ 对于mindspore框架1.3以上版本,修改resnet50_imagenet2012_Boost_config.yaml + +### 运行测试 + +完成配置文件配置后执行性能测试操作,本地测试命令如下: + +``` +./ais-bench-stubs test +``` + +连接tester服务器测试时,无需test参数。 + +## 附录 + +### **日志级别设置** + +性能测试启动后,默认在性能测试软件包解压路径/log目录下输出日志。 + +如果需要设置日志级别,请在性能测试软件包解压路径/config目录下的mindspore_env.sh或tensorflow_env.sh文件中添加如下环境变量。 + +``` +export GLOG_v=3 +``` + +GLOG日志级别取值为:0(INFO)、1(WARNING)、2(ERROR)、3(FATAL)。 + +### rank_table文件生成与实例 + 
+单机或集群rank_table文件生成方法,请单击[芯片资源信息配置文件参考](https://support.huawei.com/enterprise/zh/doc/EDOC1100192402/a1885ca4)访问相关文档。 + +生成双机16卡的rank_table文件示例:rank_table_16p_64_66.json + +```bash +{ + "version": "1.0", + "server_count": "2", + "server_list": [ + { + "server_id": "xx.xx.xx.xx", + "device": [ + {"device_id": "0", "device_ip": "xx.xx.xx.xx", "rank_id": "0"}, + {"device_id": "1", "device_ip": "xx.xx.xx.xx", "rank_id": "1"}, + {"device_id": "2", "device_ip": "xx.xx.xx.xx", "rank_id": "2"}, + {"device_id": "3", "device_ip": "xx.xx.xx.xx", "rank_id": "3"}, + {"device_id": "4", "device_ip": "xx.xx.xx.xx", "rank_id": "4"}, + {"device_id": "5", "device_ip": "xx.xx.xx.xx", "rank_id": "5"}, + {"device_id": "6", "device_ip": "xx.xx.xx.xx", "rank_id": "6"}, + {"device_id": "7", "device_ip": "xx.xx.xx.xx", "rank_id": "7"} + ], + "host_nic_ip": "reserve" + }, + { + "server_id": "xx.xx.xx.xx", + "device": [ + {"device_id": "0", "device_ip": "xx.xx.xx.xx", "rank_id": "8"}, + {"device_id": "1", "device_ip": "xx.xx.xx.xx", "rank_id": "9"}, + {"device_id": "2", "device_ip": "xx.xx.xx.xx", "rank_id": "10"}, + {"device_id": "3", "device_ip": "xx.xx.xx.xx", "rank_id": "11"}, + {"device_id": "4", "device_ip": "xx.xx.xx.xx", "rank_id": "12"}, + {"device_id": "5", "device_ip": "xx.xx.xx.xx", "rank_id": "13"}, + {"device_id": "6", "device_ip": "xx.xx.xx.xx", "rank_id": "14"}, + {"device_id": "7", "device_ip": "xx.xx.xx.xx", "rank_id": "15"} + ], + "host_nic_ip": "reserve" + } + ], + "status": "completed" +} +``` + +### 节点ssh信息文件 + +集群场景下执行性能测试时,需要设置节点ssh信息文件,用于在测试过程中节点之间的登录验证。 + +节点ssh信息文件由用户自行创建文件名格式类似于ssh64_66.json,按照如下示例格式进行配置。 + +示例:ssh64_66.json + +```bash +{ + "cluster": { + "xx.xx.xx.xx": { # 节点IP,必须与rank_table文件中的server_id一一对应 + "user": "xxxx", # 节点登录用户名,完成集群节点免密设置可不配置 + "pd": "xxxx", # 节点登录密码,完成集群节点免密设置可不配置 + "port": xx # 容器端口,默认22。可不设置。删除本参数或配置为22时,表示性能测试在默认22端口通信;设置具体端口号时,表示在容器或者设备中运行并提供指定端口访问能力 + + }, + "xx.xx.xx.xx": { + "user": "xxxx", + "pd": "xxxx", + "port": xx + } + } +} 
+``` + +**注意:该文件中的节点数目应与rank_table文件中的节点数目一致。** + +### 集群节点免密设置 + +集群节点免密设置的参考操作如下: + +1. 登录集群管理节点并生成SSH Key。 + + ``` + ssh-keygen -t rsa -b 2048 + ``` + + 安全起见,建议用户到“Enter passphrase”步骤时输入密钥密码,且符合密码复杂度要求。建议执行该命令前先将umask设置为0077,测试完成后再恢复原来umask值。 + +2. 将管理节点的公钥拷贝到所有节点的机器上。 + + ``` + ssh-copy-id -i ~/.ssh/id_rsa.pub @ + ``` + + @替换成要拷贝到的对应节点的用户名和IP。 + +3. 设置ssh代理管理ssh密钥。 + + ``` + ssh-agent bash # 开启ssh-agent的bash进程 + ssh-add # 向ssh-agent添加私钥 + ``` + + 避免工具批量安装操作过程中输入密钥密码和节点密码。 + + + diff --git "a/huawei/ais-bench_workload/doc/ais-bench_workload\346\216\250\347\220\206\346\211\247\350\241\214\345\256\271\345\231\250\347\216\257\345\242\203\346\220\255\345\273\272\346\214\207\345\257\274.md" "b/huawei/ais-bench_workload/doc/ais-bench_workload\346\216\250\347\220\206\346\211\247\350\241\214\345\256\271\345\231\250\347\216\257\345\242\203\346\220\255\345\273\272\346\214\207\345\257\274.md" new file mode 100644 index 0000000000000000000000000000000000000000..01e4a70352b6672932ae9230d790f02a6590e1fb --- /dev/null +++ "b/huawei/ais-bench_workload/doc/ais-bench_workload\346\216\250\347\220\206\346\211\247\350\241\214\345\256\271\345\231\250\347\216\257\345\242\203\346\220\255\345\273\272\346\214\207\345\257\274.md" @@ -0,0 +1,160 @@ +# ais-bench_workload推理执行容器环境搭建指导 + +## 1. 简介 + +本文基于华为昇腾镜像仓库推理基准镜像algorithm增加相关命令,用于构建容器环境,用于Ais-Bench推理负载包运行。 + +## 2. 下载基础镜像 + +本文基于昇腾镜像仓库algorithm基础镜像制作,链接为 (https://ascendhub.huawei.com/#/detail/algorithm) 请进入点击 "获取镜像"按钮,下载基础镜像。 + +本文示例基于华为昇腾基础镜像algorithm 22.0.RC1, CANN 5.1.RC1。镜像名称,22.0.RC1-ubuntu18.04。该镜像已经安装nnrt和python3.7.5。该类镜像有多个发行平台都可以作为基础镜像。 + +拉取基础镜像的方法请参照**附录1**进行。 + +下载完毕后,请按该官网"如何使用镜像"来配置物理机环境。其步骤3,可以参考附录2。 + +## 3.基于基准镜像构建新镜像 + +### 3.1 准备依赖程序包 + +工作目录algorithm文件如下: + +目录文件树 + +``` +root@root:/home/lhb/tool/algorithm# tree +. 
+├── aclruntime-0.0.1-cp37-cp37m-linux_aarch64.whl +├── Dockerfile +└── loadgen-0.0.1-cp37-cp37m-linux_aarch64.whl +``` + +说明: + +- aclruntime-0.0.1-cp37-cp37m-linux_aarch64.whl, 来自于ais-bench_workload发行包aclruntime-aarch64.tar.gz +- loadgen-0.0.1-cp37-cp37m-linux_aarch64.whl, 来自于ais-bench_workload发行包Ais-Bench-LoadGen-aarch64-1.1.tar.gz + +### 3.2 创建Dockerfile + +```bash +FROM ascendhub.huawei.com/public-ascendhub/algorithm:22.0.RC1-ubuntu18.04 + +MAINTAINER liangchaoming + +USER root +RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak +# apt源加速 +RUN sed -i "s@/archive.ubuntu.com/@/mirrors.163.com/@g" /etc/apt/sources.list && rm -rf /var/lib/apt/lists/* && apt-get update --fix-missing -o Acquire::http::No-Cache=True +# 恢复基础容器中python3.7.5在容器中的指向.容器支持python3、python3.7.5的python调用 +RUN mkdir -p /opt/package +RUN ln -s /usr/local/python37/bin/python3 /usr/bin/python3.7.5 && ln -s /usr/local/python37/bin/pip3 /usr/bin/pip3.7.5 + +# 安装系统依赖 +RUN apt-get install libglib2.0-dev libgl1-mesa-glx -y && apt install vim -y +# 安装pip3依赖。 +RUN pip3 install numpy tqdm pycocotools scikit-learn transformers tokenization opencv_python -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com +WORKDIR /opt/package +# 安装推理运行环境aclruntime和loadgen +COPY aclruntime-0.0.1-cp37-cp37m-linux_aarch64.whl /opt/package/aclruntime-0.0.1-cp37-cp37m-linux_aarch64.whl +COPY loadgen-0.0.1-cp37-cp37m-linux_aarch64.whl /opt/package/loadgen-0.0.1-cp37-cp37m-linux_aarch64.whl +RUN pip3 install aclruntime-0.0.1-cp37-cp37m-linux_aarch64.whl +RUN pip3 install loadgen-0.0.1-cp37-cp37m-linux_aarch64.whl +# online install tensorflow1.15 +RUN pip3 install https://files.pythonhosted.org/packages/35/9a/985a1642bc493b96c340da6db366124b2609c7a42ca53f643585c01e4d33/tensorflow_ascend-1.15.0-cp37-cp37m-manylinux2014_aarch64.whl +CMD ["source", "/usr/local/Ascend/nnrt/set_env.sh"] +``` + +说明: + +如果需要网络代理,请在Dockerfile增加网络代理环境变量。在“USER root"下一行,增加类似如下环境变量: + +```bash +# 设置网络代理环境变量 +ENV http_proxy 
"http://xxxx:xxxx" +ENV https_proxy "http://xxxx:xxxx" +ENV ftp_proxy "http://xxxx:xxxx" +``` + + + +### 3.3 构建镜像 + +编译指令: + +``` +docker build -t ais-bench_workload-inference-arm:1.0 . +``` + +执行成功后,回显如下: + +``` +root@root:/home/lhb/tool/algorithm# docker images +REPOSITORY TAG IMAGE ID CREATED SIZE +ais-bench_workload-inference-arm 1.0 d16debfbde9c About an hour ago 2.83GB +``` + +## 3. 运行镜像 + +``` +root@root:/home/lhb/tool/algorithm# docker run -itd -u root -v /home/:/home --name asend_inference_aarch64 -e ASCEND_VISIBLE_DEVICES=0 52dbef81d817 /bin/bash +a1ed8ed48256f7ece143c986fd337aefb29f50a307f342cee1e88881618eb29a +root@root:/home/lhb/tool/algorithm# docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +a1ed8ed48256 52dbef81d817 "/bin/bash" 4 seconds ago Up 3 seconds asend_inference_aarch64 +root@root:~# docker exec -it a1ed8ed48256 bash +root@a1ed8ed48256:/opt/package# +``` + +说明: + +- -d参数,后台执行。退出容器后容器还存在着 + +- 以root用户登录将物理机的/home目录映射到容器的/home目录 + +- 52dbef81d817是推理镜像ID + +- A500小站驱动等部署较为特殊,其容器启动命令有其特殊性 + + ```bash + docker run -itd -u root --device=/dev/davinci0 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -v /usr/local/bin/npu-smi/:/usr/local/bin/npu-smi/ -v /home/data/miniD/driver/lib64/:/home/data/miniD/driver/lib64/ -v /home/:/home/ -e ASCEND_VISIBLE_DEVICES=0 52dbef81d817 /bin/bash + ``` + + 容器拉起后,进入容器中,执行以下语句声明下系统库最新路径 + + ``` + expert LD_LIBRARY_PATH=/home/data/miniD/driver/lib64/:$LD_LIBRARY_PATH + ``` + + A500环境资源有限,请及时清理docker资源保证推理测试顺利进行 + +## 4. 附录 + +### 4.1 修改linux用户HwHiAiUser的id值为1000 + +背景:物理机当前HwHiAiUser的id是1001,zjut-msadvisor用户的id是1000 + +步骤: + +```bash +root@root:~# usermod -u 1003 zjut-msadvisor +usermod: user zjut-msadvisor is currently used by process 91928 +root@root:~# ps -ef |grep 91928 +zjut-ms+ 91928 1 0 12:33 ? 00:00:00 /lib/systemd/systemd --user +zjut-ms+ 91929 91928 0 12:33 ? 
00:00:00 (sd-pam) +root 94128 92493 0 12:49 pts/0 00:00:00 grep --color=auto 91928 +root@root:~# kill -9 91928 91929 +root@root:~# usermod -u 1003 zjut-msadvisor +usermod: user zjut-msadvisor is currently used by process 92042 +root@root:~# kill -9 92042 +root@root:~# usermod -u 1003 zjut-msadvisor +root@root:~# groupmod -g 1003 zjut-msadvisor +root@root:~# id zjut-msadvisor +uid=1003(zjut-msadvisor) gid=1003(zjut-msadvisor) groups=1003(zjut-msadvisor) +root@root:~# usermod -u 1000 HwHiAiUser +root@root:~# groupmod -g 1000 HwHiAiUser +root@root:~# id HwHiAiUser +uid=1000(HwHiAiUser) gid=1000(HwHiAiUser) groups=1000(HwHiAiUser) +``` + +说明:新id--1003不得与现有用户id重复 diff --git "a/huawei/ais-bench_workload/doc/ais-bench_workload\346\236\204\345\273\272\346\225\231\347\250\213.md" "b/huawei/ais-bench_workload/doc/ais-bench_workload\346\236\204\345\273\272\346\225\231\347\250\213.md" new file mode 100644 index 0000000000000000000000000000000000000000..f36cb61d069945ff6b7d75be0d593c704becebec --- /dev/null +++ "b/huawei/ais-bench_workload/doc/ais-bench_workload\346\236\204\345\273\272\346\225\231\347\250\213.md" @@ -0,0 +1,235 @@ +# ais-bench-workload构建教程 + +## 概述 +ais-bench标准化性能测试软件,又称AI Server Benchmark软件,是根据AI标准(IEEE 2937及T/CESA 1169-2021)对AI服务器进行性能测试的工具软件。 + +ais-bench_workload是ais-bench提供用于构建ais-bench性能测试软件包并进行测试的负载工具。 + +本文档主要介绍如何**搭建ais-bench_workload构建环境**并在该环境下**构建训练场景的ais-bench性能测试软件包。** +ais-bench_workload支持快速构建和标准构建。其中快速构建仅支持mindspore框架的bert和resnet两个典型模型的性能测试软件包构建。通过构建脚本扩展,标准构建能支持当前提供的所有模型的性能测试软件包构建。 + +## 1. 
搭建ais-bench_workload构建环境 +### 1.1 环境要求 + +ais-bench_workload构建支持在Windows和Linux系统下进行,要求如下: + +- **Windows系统**:Windows7及以上版本;安装git和winrar,版本不限。 + +- **Linux系统**:系统版本无限制;安装git和unrar,版本不限。 + + 其中git、winrar和unrar的下载与安装,请用户自行完成,本文不详细描述。 + +### 1.2 源码下载 +ais-bench_workload的工作目录保存在ais-bench源码包的tools目录下,可以通过以下两种方式下载ais-bench源码包: + ++ git clone下载[AISBench/training](https://gitee.com/aisbench/training)仓库代码 +``` + git clone https://gitee.com/aisbench/training.git +``` +​ 该方式直接下载码云training仓库master分支源码。 + ++ 在线下载源码压缩包 + 访问AISBench/training仓库网页:https://gitee.com/aisbench/training , 点击“克隆/下载”按钮,在弹出的窗口中点击“下载ZIP”按钮进行下载。 + + 该方式默认下载的是master分支training源码包为压缩包training-master.zip,解压后training目录默认名字是training-master 。 + +### 1.3 工作目录 + +​ 获取源码包后,需要在ais-bench_workload工作目录下执行构建操作。ais-bench_workload工作目录为training目录下的子目录,目录结构如下: + +仅展示构建需要部分 + +```bash +ais-bench_workload +├── build +│   ├── build.sh # 标准构建脚本 +│   └── download_and_build.sh # 快速构建脚本 +├── doc +│   ├── ais_bench推理程序更新说明.md +│   ├── ais-bench_workload_inference推理负载说明文档.md +│   ├── ais-bench_workload_train_modelarts训练说明文档.md +│   ├── ais-bench_workload_train_offline线下训练说明文档.md +│   ├── ais-bench_workload构建教程.md +│   ├── ais-bench_workload推理执行容器环境搭建指导.md +│   ├── modelarts_notebook使用入门指导.docx +│   └── 制作可ssh登录镜像ascend-mindspore-arm的方法.md +├── README.md +├── src # 构建测试软件包的模型保存目录 +│   └── train # 训练场景 +│   ├── huawei # 华为模型 +│   │   ├── train_mindspore_bert +│   │   ├── train_mindspore_deeplabv3 +│   │   ├── train_mindspore_deepspeech2 +│   │   ├── train_mindspore_faster_rcnn +│   │   ├── train_mindspore_glm2 +│   │   ├── train_mindspore_gnmt_v2 +│   │   ├── train_mindspore_llama +│   │   ├── train_mindspore_pangu_alpha +│   │   ├── train_mindspore_resnet +│   │   ├── train_mindspore_ssd +│   │   ├── train_mindspore_widedeep +│   │   ├── train_tensorflow_bert_base +│   │   ├── train_tensorflow_densenet121 +│   │   ├── train_tensorflow_mobilenetv2 +│   │   ├── train_tensorflow_nezha_large +│   │   ├── train_tensorflow_resnet101 +│   │   
├── train_tensorflow_resnet50 +│   │   ├── train_tensorflow_resnext50 +│   │   ├── train_tensorflow_ssd_resnet34 +│   │   ├── train_tensorflow_vgg16 +│   │   └── train_tensorflow_yolov3 +│   └── nvidia # 英伟达模型 +│   ├── train_tensorflow_bert +│   └── train_tensorflow_resnet +``` + + + +## 2. 构建ais-bench性能测试软件包 +### 2.1 快速构建(仅支持AISBench 1.0版本) + +#### 2.1.1 简介 + +快速构建可以一键构建mindspore框架的bert&resnet典型模型分别在aarch64和x86_64平台训练场景的ais-bench性能测试软件包。对于其它模型的训练软件包的构建,需要通过标准构建获取。 + +快速构建也可以通过扩展快速构建脚本downlaod_and_build.sh,将其它模型加入快速构建中,实现对其它模型测试软件包的快速构建。这要求熟悉Python和Shell语言并对快速构建脚本有一定的了解。 + +#### 2.1.2 约束 + +支持构建环境: + +- **Windows系统**:git bash--Mircrosoft Windows git命令的模拟终端。 + +- **Linux系统**: + + 要求操作系统处于稳定的联网状态,主要保证能够顺利下载ais-bench stubs基础测试工具包,可以先执行如下命令测试网络是否畅通: + + ```bash + curl http://www.aipubservice.com + ``` + +不建议多用户同时执行快速构建操作,可能出现依赖下载失败。 + +#### 2.1.3 构建指令 + +指令格式:bash ./download_and_build.sh {version} {type} +参数说明: + +| 参数 | 说明 | +| --------- | ------------------------------------------------------------ | +| {version} | 框架版本号,必选。取值需通过ais-bench_workload\src目录的具体模型目录下的版本文件确认支持的版本号。快速构建仅支持bert和resnet模型。故可配置的版本号为:{type}参数为“modelarts”时,可配置为r1.3、r1.5、r1.7、r1.8、r1.9、r1.10、r2.0、r2.1、r2.2;未配置{type}参数时可配置为r1.5、r1.6、r1.7、r1.8、r1.9、r1.10、r2.0、r2.1、r2.2。 | +| {type} | 线上或离线环境,可选。取值为“modelarts”,表示构建线上环境的性能测试软件包;不配置本参数时,表示构建离线环境的性能测试软件包。 | + +#### 2.1.4 构建操作 + +示例,构建基于mindspsore 1.7版本线上执行bert和resnet模型的性能测试软件包,指令如下: + +``` +bash ./download_and_build.sh r1.7 modelarts +``` + +Windows系统需预先安装git软件。在ais-bench_workload工作目录下鼠标右键上下文菜单中点击"git bash here",打开Microsoft Windows git命令行模拟终端,执行构建指令。鼠标右键没有git相关菜单命令时,请在windows右下角的搜索窗口输入"git" ,找到git bash,并点击进入,执行构建指令。 + +#### 2.1.5 构建结果 +构建指令成功执行后,在ais-bench_workload目录下生成output子目录,构建的性能测试软件包保存在该子目录中。 +构建结果示例: + +``` +. 
+├── train_huawei_train_mindspore_bert-Ais-Benchmark-Stubs-aarch64-1.0-r1.7_modelarts.tar.gz +├── train_huawei_train_mindspore_bert-Ais-Benchmark-Stubs-x86_64-1.0-r1.7_modelarts.tar.gz +├── train_huawei_train_mindspore_resnet-Ais-Benchmark-Stubs-aarch64-1.0-r1.7_modelarts.tar.gz +└── train_huawei_train_mindspore_resnet-Ais-Benchmark-Stubs-x86_64-1.0-r1.7_modelarts.tar.gz +``` + +分别生成aarch64和x86_64平台下bert和resnet模型共4个软件包。 + +### 2.2 标准构建 + +#### 2.2.1 简介 + +标准构建是通过构建指令指定具体模型执行构建性能测试软件包的操作,相比快速构建,指令更丰富。 + +#### 2.2 约束 + +- 仅支持在Linux系统下执行构建操作。 + +- 要求操作系统处于稳定的联网状态,主要是保证能够顺利下载ais-bench stubs基础测试工具包。可以先执行如下命令测试网络是否畅通: + + ```bash + curl http://www.aipubservice.com + ``` + +- 一次执行只能构建一个模型的性能测试软件包。 + +#### 2.2.3 构建准备 + +进行标准构建前,需要先下载ais-bench stubs基础测试工具包并将该工具解压到ais-bench_workload工作目录的build子目录下。 + +ais-bench stubs基础测试工具包用于选择构建测试软件包适用的aarch64和x86_64平台。 + +访问[面向人工智能基础技术及应用的检验检测基础服务平台](http://www.aipubservice.com/#/show/compliance/detail/127), 通过“成果展示”->“标准符合性测试”->“人工智能服务器系统性能测试”, 进入“人工智能服务器系统性能测试”页面,在“测试工具”章节下载Stubs压缩包到本地,将Stubs压缩包解压到ais-bench_workload/build目录下。 + +执行操作后,ais-bench_workload/build目录结构如下: + +``` +ais-bench_workload +├── build +    ├── build.sh +    ├── download_and_build.sh + ├── Ais-Benchmark-Stubs-aarch64-.tar.gz + └── Ais-Benchmark-Stubs-x86_64-.tar.gz +``` + +为软件包版本号。 + +#### 2.2.4 构建指令 + +指令格式:./build.sh {$stubs_file} {mode} {secondary-folder-name} {third-folder-name} {version} {type} +输出路径:在ais-bench_workload\output目录会生成相应程序包。 + +| 参数 | 说明 | +| ----------------------- | ------------------------------------------------------------ | +| {stubs_file} | 选择stubs基础工具包,即选择构建测试软件包使用的aarch64和x86_64平台,必选。取值为Ais-Benchmark-Stubs-aarch64-.tar.gz、Ais-Benchmark-Stubs-x86_64-.tar.gz | +| {mode} | 选择构建测试软件包的适用场景,必选。取值为:train(训练场景)。对应ais-bench_workload/src目下以及子目录名称。 | +| {secondary-folder-name} | 二级子目录名称,对应ais-bench_workload/src目录下二级子目录名称,必选。{mode}配置为train时,表示选择模型品牌,取值为:huawei、nvidia | +| {third-folder-name} | 三级子目录名称,对应ais-bench_workload/src目录下三级子目录名称,必选。
{secondary-folder-name}配置为language时,取值为bert;
{secondary-folder-name}配置为vision时,取值为classification_and_detection;
{secondary-folder-name}配置为huawei时,取值为:train_mindspore_bert、train_mindspore_deeplabv3、train_mindspore_deepspeech2、train_mindspore_faster_rcnn、train_mindspore_glm2、train_mindspore_gnmt_v2、train_mindspore_llama、train_mindspore_pangu_alpha、train_mindspore_resnet、train_mindspore_ssd、train_mindspore_widedeep、train_tensorflow_bert_base、train_tensorflow_densenet121、train_tensorflow_mobilenetv2、train_tensorflow_nezha_large、train_tensorflow_resnet50、train_tensorflow_resnet101、train_tensorflow_resnext50、train_tensorflow_ssd_resnet34、train_tensorflow_vgg16、train_tensorflow_yolov3;
{secondary-folder-name}配置为nvidia时,取值为train_tensorflow_bert、train_tensorflow_resnet | +| {version} | 模型框架版本号,仅{mode}配置为train时支持,可选。取值需通过ais-bench_worload工作目录具体模型目录下的版本文件确认支持的模型框架版本号 | +| {type} | 线上或离线环境,仅{mode}配置为train时支持,可选。取值为“modelarts",表示构建线上环境的性能测试软件包;不配置本参数时,表示构建离线环境的性能测试软件包 | + + + +#### 2.2.5 构建操作 + +**训练场景示例如下:** + ++ 构建aarch64架构训练场景huawei mindspore框架r1.7版本resnet模型 离线运行的性能测试软件包 + ./build.sh ./Ais-Benchmark-Stubs-aarch64-1.0.tar.gz train huawei train_mindspore_resnet r1.7 ++ 构建aarch64架构训练场景huawei mindspore框架r1.7版本resnet模型modelarts线上运行的性能测试软件包 + ./build.sh ./Ais-Benchmark-Stubs-aarch64-1.0.tar.gz train huawei train_mindspore_resnet r1.7 modelarts + + +## FAQ + +### 1. Linux环境下执行构建报错:$'\r': command not found + +**故障现象** + +在Linux环境执行./build.sh构建操作时会出现如下报错: + +![faq1](../img/faq1.png) + +**故障原因** + +build.sh脚本是从Windows环境开发的,Windows的换行符格式与Linux不一致,导致在Linux环境执行报错。 + +**故障处理** + +在Ascend/tools/ais-bench-workload目录下执行如下命令将脚本进行格式化后重新执行构建操作: + +``` +find src/train/huawei/train_mindspore_glm2 -type f -name "*.sh" -exec dos2unix {} + +``` + + + diff --git "a/huawei/ais-bench_workload/doc/modelarts_notebook\344\275\277\347\224\250\345\205\245\351\227\250\346\214\207\345\257\274.docx" "b/huawei/ais-bench_workload/doc/modelarts_notebook\344\275\277\347\224\250\345\205\245\351\227\250\346\214\207\345\257\274.docx" new file mode 100644 index 0000000000000000000000000000000000000000..17a932eab792359f780d7b1d3cb942a3869b718f Binary files /dev/null and "b/huawei/ais-bench_workload/doc/modelarts_notebook\344\275\277\347\224\250\345\205\245\351\227\250\346\214\207\345\257\274.docx" differ diff --git "a/huawei/ais-bench_workload/doc/\345\210\266\344\275\234\345\217\257ssh\347\231\273\345\275\225\351\225\234\345\203\217ascend-mindspore-arm\347\232\204\346\226\271\346\263\225.md" "b/huawei/ais-bench_workload/doc/\345\210\266\344\275\234\345\217\257ssh\347\231\273\345\275\225\351\225\234\345\203\217ascend-mindspore-arm\347\232\204\346\226\271\346\263\225.md" new file mode 100644 
index 0000000000000000000000000000000000000000..ebb9bf0d58f403159d8e3b6051f878cd3bf43b74 --- /dev/null +++ "b/huawei/ais-bench_workload/doc/\345\210\266\344\275\234\345\217\257ssh\347\231\273\345\275\225\351\225\234\345\203\217ascend-mindspore-arm\347\232\204\346\226\271\346\263\225.md" @@ -0,0 +1,163 @@ + + + + + + +# 制作可ssh登录镜像ascend-mindspore-arm的方法 + + +本文均以Dockerfile实现镜像说明。 +## 1.获取基础镜像 +### 1.1 华为开源arm镜像 + +华为开源镜像库网址:https://ascendhub.huawei.com/#/index +以下以基础镜像ascend-mindspore-arm为例,集成mindspore r1.5来说明: + +- 基础镜像ascend-mindspore-arm网址:https://ascendhub.huawei.com/#/detail/ascend-mindspore-arm。 ubuntu18.04系统 +- 登录基础镜像网址,点击“获取镜像” +- 在随后出现的Uniportal帐号登录界面,选择账号/邮箱登录、手机号码登录、短信登录三种方式之一,登入。如果网页出现“禁止”字样,请更换登录方式。建议“短信登录”方法登录。 +- 版本界面,选择版本“21.0.1.spc001”, 点击下载列表对应的“立即下载”,进入下载界面 +- 下载界面会显示下载步骤,请按步骤执行。 + +​示例: + +​获取登录访问权限并复制到工作节点执行: + +``` +docker login -u WX926930 -p 4u9xchG5IzMuGgVFxvvMVH895SwE0tIXAQrBwl0C46uHzhMwYEq5eWV0EvYbG7CdO ascendhub.huawei.com +``` + +​下载镜像: + +``` +docker pull ascendhub.huawei.com/public-ascendhub/ascend-mindspore-arm:21.0.1.spc001 +``` + +- 在工作节点查询镜像: + +``` +(base) root@node62:/home/lhb/code/ascend-mindspore-arm_ssh# docker images |grep ascend-mindspore-arm +ascendhub.huawei.com/public-ascendhub/ascend-mindspore-arm 21.0.1.spc001 67bcd3733d57 5 weeks ago 6.67GB +(base) root@node62:/home/lhb/code/ascend-mindspore-arm_ssh# +``` +### 1.2 自定义欧拉镜像 +以下以EulerOS 2.0(SP8)平台ascend-mindspore-arm-base:1.0基础镜像为例说明: ++ 镜像功能内部集成训练通用的第三方库(系统包、pip3、openssh_server), 未集成MindSpore框架、ascend toolkit工具包 ++ 以下仅以实现ssh登录说明,请自行部署业务驱动和工具包 +## 2.目标镜像制作 +工作目录ascend-mindspore-arm_ssh + +### 2.1 镜像相关文件准备 +#### 2.1.1 华为开源arm镜像 + +目录文件如下: + +- ​Ascend-cann-toolkit_5.0.3_linux-aarch64.run 请自行下载 +- ​ Dockerfile。内容如2.2.1 小节所示 +- ​ 容器启动run_container.sh脚本 + +​内容: + +``` +docker run -it --ipc=host --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 
--device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -v /usr/local/Ascend/driver:/usr/local/Ascend/driver -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ -v /var/log/npu/:/usr/slog -v /home/:/home -p 8000:22 ascend-mindspore-arm:ms1.5 bash -c "/etc/init.d/ssh start && /bin/bash" +``` + +说明:-p 8000:22 表示外部端口8000映射容器22端口,提供外部ssh访问能力 +#### 2.1.2 自定义欧拉镜像 +目录文件如下: +- Dockerfile 内容如2.2.2小节所示。 +- sshpass-1.06.tar.gz 点[这里](https://nchc.dl.sourceforge.net/project/sshpass/sshpass/1.06/sshpass-1.06.tar.gz)下载 +- run_container.sh容器拉起脚本 +内容: +``` +docker run -it --ipc=host --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -v /usr/local/Ascend/driver:/usr/local/Ascend/driver -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ -v /var/log/npu/:/usr/slog -v /home/:/home -p 8000:22 ascend-mindspore-arm-base:ms1.5 bash -c "/usr/sbin/sshd -D && /bin/bash" +``` + +### 2.2 创建Dockerfile +#### 2.2.1 华为开源arm镜像 +工作目录创建名字为Dockerfile文件,内容如下: + +```bash +FROM ascendhub.huawei.com/public-ascendhub/ascend-mindspore-arm:21.0.1.spc001 +MAINTAINER liangchaoming +RUN apt-get update \ + && /usr/bin/python3.7 -m pip install --upgrade pip \ + && apt-get install libnuma-dev openssh-server apt-utils sshpass -y \ + && /usr/local/Ascend/nnae/latest/script/uninstall.sh +ADD Ascend-cann-toolkit_5.0.3_linux-aarch64.run /opt/packet/Ascend-cann-toolkit_5.0.3_linux-aarch64.run +RUN /opt/packet/Ascend-cann-toolkit_5.0.3_linux-aarch64.run --full \ + && pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/1.5.0/MindSpore/ascend/aarch64/mindspore_ascend-1.5.0-cp37-cp37m-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://pypi.tuna.tsinghua.edu.cn/simple \ + && echo 'root:root'|chpasswd \ + && mkdir -p /var/run/sshd \ + && sed 
-i 's/.*PermitRootLogin.*/PermitRootLogin yes/g' /etc/ssh/sshd_config +EXPOSE 22 +CMD ["/usr/sbin/sshd", "-D"] +``` +#### 2.2.2 自定义欧拉镜像 + +```bash +FROM ascend-mindspore-base:1.0 +MAINTAINER liangchaoming +COPY sshpass-1.0.6.tar.gz /opt/packet/sshpass-1.0.6.tar.gz +RUN cd /opt/packet/ \ + && tar -xvzf sshpass-1.0.6.tar.gz \ + && cd sshpass-1.0.6 \ + && ./configure --prefix=/usr/local/ \ + && make && make install +RUN echo 'root:root'|chpasswd \ + && mkdir -p /var/run/sshd \ + && sed -i 's/.*PermitRootLogin.*/PermitRootLogin yes/g' /etc/ssh/sshd_config +EXPOSE 22 +``` + +### 2.3 编译镜像 + +- 创建指令: + +``` +docker build -t ascend-mindspore-arm-base:ms1.5 . +``` + +注意:指令末尾的".",表示使用当前目录的Dockfile。tag--ms1.5, 请自己决定,这里仅仅示意 + +- 查询当前节点镜像列表: + +``` +(base) root@node64:/home/lhb/test2# docker images |grep ascend-mindspore-arm +ascend-mindspore-arm-base ms1.5 2454f44b88ee 5 hours ago 12.1GB +``` + +- 执行./run_container.sh创建工作容器,并查询容器状态 + +``` +(base) root@node64:/home/lhb/test2# docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +0f6f9971a646 ascend-mindspore-arm-base:ms1.5 "bash -c '/etc/init.…" 3 hours ago Up 3 hours 0.0.0.0:8000->22/tcp compassionate_cerf +``` +## 3. 拉起容器 +执行命令:bash run_container.sh +## 4. ssh登录验证 + +在其它节点执行 ssh {user}@{IP} -p 8000 + +示例:ssh {user}@{IP} -p 8000 {password} + +执行结果: + +``` +[root@node66 ~]# ssh {user}@{IP} -p 8000 +{usr}@{IP}'s password: +Welcome to Ubuntu 18.04.5 LTS (GNU/Linux 4.15.0-29-generic aarch64) + + * Documentation: https://help.ubuntu.com + * Management: https://landscape.canonical.com + * Support: https://ubuntu.com/advantage +This system has been minimized by removing packages and content that are +not required on a system that users do not log into. + +To restore this content, you can run the 'unminimize' command. 
+Last login: Thu Nov 18 09:10:30 2021 from {IP} +root@0f6f9971a646:~# +``` + diff --git "a/huawei/ais-bench_workload/doc/\345\256\211\345\205\250\345\243\260\346\230\216.md" "b/huawei/ais-bench_workload/doc/\345\256\211\345\205\250\345\243\260\346\230\216.md" new file mode 100644 index 0000000000000000000000000000000000000000..edc35f4d50c29afcb92532ba60c72383f4204f11 --- /dev/null +++ "b/huawei/ais-bench_workload/doc/\345\256\211\345\205\250\345\243\260\346\230\216.md" @@ -0,0 +1,10 @@ +# 安全声明 + +## 通信矩阵 + +ais-bench_workload通信矩阵 + +| 序号 | 功能 | 源设备 | 源IP | 源端口 | 目的设备 | 目的IP | 目的端口
(侦听) | 协议 | 端口说明 | 端口配置 | 侦听端口是否可更改 | 认证方式 | 加密方式 | 所属平面 | 版本 | 特殊场景 | 备注 | +|:----|:-----------|:------------------|:---------------------|:------|:-------------------|:---------------------|:--------------|:-----------|:-------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------|:-----|:-----|:-------|:-----------------------|:-----|:---| +| 1 | ais-bench_workload集群构建的多节点之间进行文件传输 | 集群中各个节点 | 集群中服务器的ip | 由用户配置(默认使用22) | 集群中各个节点 | 集群中服务器的ip | 由用户配置(默认使用22) | ssh | 传输节点间的文件 | 不涉及 | 不涉及 | ssh协议 | ssh协议 | 业务面 | 所有版本 | 无 | | + diff --git a/huawei/ais-bench_workload/img/faq1.png b/huawei/ais-bench_workload/img/faq1.png new file mode 100644 index 0000000000000000000000000000000000000000..e473985c0bdef03c836e34e27fdade56728223d6 Binary files /dev/null and b/huawei/ais-bench_workload/img/faq1.png differ diff --git a/huawei/ais-bench_workload/src/ais_utils_adapter.py b/huawei/ais-bench_workload/src/ais_utils_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..23e885894660997f3a18f036d71b87b2795cc085 --- /dev/null +++ b/huawei/ais-bench_workload/src/ais_utils_adapter.py @@ -0,0 +1,28 @@ +import sys +import time +import set_result as old_set_result + +def calc_throughput_rate(nums, start_time, end_time): + if start_time == end_time: + return 0 + else: + return nums/(end_time - start_time) + +def calc_lantency(elapsedtime, count): + latency = 0 if count == 0 else elapsedtime/count + return latency + +def get_datatime(): + return time.time() + +def set_result(mode, key, value): + old_set_result.set_result(mode, key, value) + +if __name__ == '__main__': + fun_name = sys.argv[1] + + if fun_name == "set_result": + mode = sys.argv[2] + key = sys.argv[3] + value = sys.argv[4] + set_result(mode, key, value) diff --git a/huawei/ais-bench_workload/src/common/calc_glm2_result.py 
b/huawei/ais-bench_workload/src/common/calc_glm2_result.py new file mode 100644 index 0000000000000000000000000000000000000000..47db5f41aa68b7c9037b04111fbc14cafa865a0a --- /dev/null +++ b/huawei/ais-bench_workload/src/common/calc_glm2_result.py @@ -0,0 +1,46 @@ +import json +import os +import sys +import ais_utils + +RESULT_PATH = sys.argv[1] +RANK_SIZE = sys.argv[2] +RUN_MODE = sys.argv[3] + +total_throughput = 0 +accuracy = 0 +merged_json_data = {} +# get all json data +for rank_id in range(int(RANK_SIZE)): + json_path = os.path.join(RESULT_PATH, f"result_rank_{rank_id}.json") + if not os.path.exists(json_path): + raise FileExistsError("{} file not exist".format(json_path)) + else: + with open(json_path, "r") as file: + json_data = json.load(file) + if not merged_json_data: + merged_json_data = json_data + for mode_key, mode_value in json_data.items(): + for param_key, param_value in mode_value.items(): + merged_json_data[mode_key][param_key].extend(param_value) + +# sort all json data +for mode_key, mode_value in merged_json_data.items(): + for param_key, param_value in mode_value.items(): + merged_json_data[mode_key][param_key].sort() + +def set_result_single(mode:str): + for key, value in merged_json_data[mode].items(): + if key == "throughput_ratio": + ais_utils.set_result("training", key, sum(value)) + elif "start" in key: + ais_utils.set_result("training", key, value[0]) + elif "end" in key: + ais_utils.set_result("training", key, value[-1]) + + +if RUN_MODE == "only_finetune": + set_result_single("train") +else: + raise RuntimeError(f"not supported run mode :{RUN_MODE}") +ais_utils.set_result("training", "result", "OK") \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/common/calc_llm_result.py b/huawei/ais-bench_workload/src/common/calc_llm_result.py new file mode 100644 index 0000000000000000000000000000000000000000..db4695e5d8c8c3b8d56b03817d4e1292f457899b --- /dev/null +++ b/huawei/ais-bench_workload/src/common/calc_llm_result.py 
@@ -0,0 +1,55 @@ +import json +import os +import sys +import ais_utils + +RESULT_PATH = sys.argv[1] +RANK_SIZE = sys.argv[2] +RUN_MODE = sys.argv[3] + +total_throughput = 0 +accuracy = 0 +merged_json_data = {} +# get all json data +for rank_id in range(int(RANK_SIZE)): + json_path = os.path.join(RESULT_PATH, f"result_rank_{rank_id}.json") + if not os.path.exists(json_path): + raise FileExistsError("{} file not exist".format(json_path)) + else: + with open(json_path, "r") as file: + json_data = json.load(file) + if not merged_json_data: + merged_json_data = json_data + continue + for mode_key, mode_value in json_data.items(): + for param_key, param_value in mode_value.items(): + merged_json_data[mode_key][param_key].extend(param_value) + +# sort all json data +for mode_key, mode_value in merged_json_data.items(): + for param_key, param_value in mode_value.items(): + merged_json_data[mode_key][param_key].sort() + +def set_result_single(mode:str): + for key, value in merged_json_data[mode].items(): + if key == "throughput_ratio": + ais_utils.set_result("training", key, sum(value)) + elif "start" in key: + ais_utils.set_result("training", key, value[0]) + elif "end" in key: + ais_utils.set_result("training", key, value[-1]) + + +def set_result_full(): + pass + + +if RUN_MODE == "only_pretrain": + set_result_single("train") +elif RUN_MODE == "only_finetune": + set_result_single("finetune") +elif RUN_MODE == "full": + set_result_full() +else: + raise RuntimeError(f"not supported run mode :{RUN_MODE}") +ais_utils.set_result("training", "result", "OK") \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/common/calc_power.sh b/huawei/ais-bench_workload/src/common/calc_power.sh new file mode 100644 index 0000000000000000000000000000000000000000..d0988a89f6c66bbd665bef161502d62d1bae4913 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/calc_power.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# need to specify ip user password + +SUCCESS=0 +FAIL=1 + +function 
get_power() +{ + result=`ipmitool -H ${BMC_IP} -I lanplus -U ${BMC_USER} -P ${BMC_PASSWORD} raw 0x30 0x93 0xdb 0x07 0x00 0x11 0x00` + first=`echo ${result} | awk '{print $1}'` + except='db' + if [ "${except}" != "${first}" ];then + echo "ERROR:ipmitool run fail,result=${result}" + BMC_POWER=0 + return ${FAIL} + fi + high=`echo ${result} | awk '{print $5}'` + low=`echo ${result} | awk '{print $4}'` + power_str="${high}${low}" + power=$((16#${power_str})) + BMC_POWER=${power} + #echo `date`" power:${BMC_POWER}" + return ${SUCCESS} +} + +function calculate_emptyload_power() +{ + max_num=$1 + num_i=0 + + # clear + BMC_POWER=0 + # MAX_POWER=0 + SUM_POWER=0 + + while [[ num_i -lt max_num ]]; do + get_power + if [ $? -eq ${FAIL} ];then + echo "ERROR:get power fail, get_power exit" + return ${FAIL} + fi + # if [ "${BMC_POWER}" -gt "${MAX_POWER}" ];then + # MAX_POWER=${BMC_POWER} + # fi + SUM_POWER=$((10#${SUM_POWER}+${BMC_POWER})) + let num_i=num_i+1 + sleep 6 + done + export EMPTYLOAD_AVERAGE_POWER=$((10#${SUM_POWER}/${max_num})) +} + +exit_trap() +{ + trap - USR2 + RUNING_AVERAGE_POWER=$((10#${RUNING_SUM_POWER}/${loop_count})) + + AVERAGE_POWER=$(python -c "import ais_utils; print(ais_utils.calc_single_avg_power( $EMPTYLOAD_AVERAGE_POWER, $RUNING_AVERAGE_POWER))") + MAX_POWER=$(python -c "import ais_utils; print(ais_utils.calc_single_max_power(${EMPTYLOAD_AVERAGE_POWER}, ${RUNING_MAX_POWER}))") + + python $CUR_PATH/ais_utils.py set_result "training" "average_power" ${AVERAGE_POWER} + python $CUR_PATH/ais_utils.py set_result "training" "max_power" ${MAX_POWER} + + echo "RUNING_AVERAGE_POWER:$RUNING_AVERAGE_POWER EMPTYLOAD_AVERAGE_POWER:$EMPTYLOAD_AVERAGE_POWER AVERAGE_POWER:$AVERAGE_POWER MAX_POWER:$MAX_POWER" + + echo "exit end $$ $loopflag loop end" +} + +function calculate_runing_power() +{ + trap exit_trap USR2 + loop_count=0 + + # clear + BMC_POWER=0 + RUNING_MAX_POWER=0 + RUNING_SUM_POWER=0 + + while [[ true ]]; do + get_power + if [ $? 
-eq ${FAIL} ];then + echo "ERROR:get power fail, get_power exit" + return ${FAIL} + fi + if [ "${BMC_POWER}" -gt "${RUNING_MAX_POWER}" ];then + RUNING_MAX_POWER=${BMC_POWER} + fi + RUNING_SUM_POWER=$((10#${RUNING_SUM_POWER}+${BMC_POWER})) + let loop_count=loop_count+1 + sleep 6 + done + echo "calc end" +} + +check_command_exist() +{ + command=$1 + if type $command >/dev/null 2>&1;then + return 0 + else + return 1 + fi +} + +function calc_powerinfo_backgroud() +{ + check_command_exist "ipmitool" + if [[ $? -ne 0 || -z "$BMC_IP" || -z "$BMC_USER" || -z "$BMC_PASSWORD" ]];then + echo "not valid power env ret" + return 0 + fi + timeout 4 ping -c3 -i1 $BMC_IP >> /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "not valid bmc_ip" + return 0 + fi + calculate_emptyload_power 3 + + calculate_runing_power & + export power_monitor_pid=$! + echo "power monitor pid:$power_monitor_pid" +} + +function set_powerinfo() +{ + if [ ! -z "$power_monitor_pid" ];then + kill -12 $power_monitor_pid + sleep 10 + echo "send signel sleep done. power_monitor_pid: $power_monitor_pid" + kill -9 $power_monitor_pid + fi +} + +# main() +# { +# export BMC_IP="xx.xx.xx.xx" +# export BMC_USER="xxxx" +# export BMC_PASSWORD="xxxx" +# export BMC_PASSWORD="" + +# calc_powerinfo_backgroud +# echo "get power ret:$?" + +# sleep 10 +# echo "sleep done now calc and kill " +# set_powerinfo +# echo "set power power ret:$?" 
+# } + +# main $@ diff --git a/huawei/ais-bench_workload/src/common/calc_resourceinfo.sh b/huawei/ais-bench_workload/src/common/calc_resourceinfo.sh new file mode 100644 index 0000000000000000000000000000000000000000..4b034128ecf86f54a79a48f106a90013e6d7315a --- /dev/null +++ b/huawei/ais-bench_workload/src/common/calc_resourceinfo.sh @@ -0,0 +1,88 @@ +#!/bin/bash +export npu_info=/var/log/npu_info.log +export gpu_info=/var/log/gpu_info.log +npu_monitor_pid=0 +gpu_monitor_pid=0 + +function calc_resourceinfo_npu() +{ + device_group_xp="$*" + npuandchip=($(npu-smi info -m | egrep -v "ID" | awk '{print $1" "$2" "$3}'| xargs)) + num_i=0 + one_average_usage=0 + set_result_file=$2 + while [[ num_i -lt ${#npuandchip[@]} ]]; do + for i in ${device_group_xp[*]}; do + if [ ${i} == ${npuandchip[num_i+2]} ];then + echo $i | grep -q '[^0-9]' + nl=$? + if [ $nl -eq 0 ];then + continue + fi + one_date_usage=$(cat ${npu_info} | awk -v a=${npuandchip[num_i]} -v b=${npuandchip[num_i+1]} '$1==a && $2==b {print $5}') + onesub_usage=$(echo $one_date_usage |xargs |sed 's/[[:space:]]/\+/g'|bc) + onesub_num=$(echo $one_date_usage |awk '{print NF}') + if [ ${onesub_num} -eq 0 ];then + one_average_usage=0 + else + one_average_usage=$(awk -v x=${onesub_usage} -v y=${onesub_num} 'BEGIN{print x/y}') + fi + echo "resource_util_ratio: $one_average_usage" + python3 ${set_result_file} "training" "resource_util_ratio" ${one_average_usage} + fi + done + let num_i=num_i+3 + done +} + +function calc_runing_resourceinfo_npu() { + if [ "$npu_monitor_pid" != "" ];then + kill $npu_monitor_pid > /dev/null 2>&1 + fi + calc_resourceinfo_npu "average_usage" $* + rm -rf ${npu_info} +} + +function run_resourceinfo_monitor_backgroud_npu() { + stdbuf -oL npu-smi info watch -d 5 >> ${npu_info} & + export npu_monitor_pid=$! 
+} + +function calc_resourceinfo_gpu() +{ + device_group_xp="$*" + one_average_usage=0 + set_result_file=$2 + echo "device_group_xp : " ${device_group_xp} + for i in ${device_group_xp[*]}; do + echo $i | grep -q '[^0-9]' + nl=$? + if [ $nl -eq 0 ];then + continue + fi + one_date_usage=$(cat ${gpu_info} | awk -v a="${i}," '$1==a {print $2}') + onesub_usage=$(echo $one_date_usage |xargs |sed 's/[[:space:]]/\+/g'|bc) + onesub_num=$(echo $one_date_usage |awk '{print NF}') + if [ ${onesub_num} -eq 0 ];then + one_average_usage=0 + else + one_average_usage=$(awk -v x=${onesub_usage} -v y=${onesub_num} 'BEGIN{print x/y}') + fi + echo "resource_util_ratio: $one_average_usage" + python3 ${set_result_file} "training" "resource_util_ratio" ${one_average_usage} + done +} + +function calc_runing_resourceinfo_gpu() +{ + if [ "$gpu_monitor_pid" != "" ];then + kill $gpu_monitor_pid > /dev/null 2>&1 + fi + calc_resourceinfo_gpu "average_usage" $* + rm -rf ${gpu_info} +} + +function run_resourceinfo_monitor_backgroud_gpu() { + stdbuf -oL nvidia-smi --query-gpu=index,utilization.gpu --format=csv -l 5 >> ${gpu_info} & + export gpu_monitor_pid=$! 
+} diff --git a/huawei/ais-bench_workload/src/common/calc_result.py b/huawei/ais-bench_workload/src/common/calc_result.py new file mode 100644 index 0000000000000000000000000000000000000000..60a9838ac9a844046f0fd28b57904a13871cb4a4 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/calc_result.py @@ -0,0 +1,43 @@ +import json +import os +import sys + +RESULT_PATH = sys.argv[1] +RANK_SIZE = sys.argv[2] + +total_throughput = 0 +accuracy = 0 +for rank_id in range(int(RANK_SIZE)): + file_name = "throughput_rank_{}".format(str(rank_id)) + log_path = os.path.join(RESULT_PATH, file_name) + if not os.path.exists(log_path): + print("{} file not exist".format(log_path)) + else: + f = open(log_path, 'r') + cur_throughput = float(f.read()) + print("{} file throught: {}".format(log_path, cur_throughput)) + total_throughput += cur_throughput + f.close() + +accuracy_file = os.path.join(RESULT_PATH, "eval_acc.log") +if not os.path.exists(accuracy_file): + print("{} file not exist".format(accuracy_file)) +else: + with open(accuracy_file, 'rb') as fd: + accuracy = float(fd.read()) + +print("throughput_ratio:{}".format(total_throughput)) +print("accuracy:{}".format(accuracy)) + +result = {'throughput_ratio': total_throughput, 'accuracy': accuracy} +result_file = os.path.join(RESULT_PATH, "result.log") +with open(result_file, 'w') as f: + json.dump(result, f) + +try: + import ais_utils + ais_utils.set_result("training", "throughput_ratio", total_throughput) + ais_utils.set_result("training", "accuracy", float(accuracy)) + ais_utils.set_result("training", "result", "OK") +except: + sys.exit() diff --git a/huawei/ais-bench_workload/src/common/cluster_common.sh b/huawei/ais-bench_workload/src/common/cluster_common.sh new file mode 100644 index 0000000000000000000000000000000000000000..30946c383fb52710371b4cac896a18847475d1e2 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/cluster_common.sh @@ -0,0 +1,267 @@ +#!/bin/bash + +SSH="ssh -o StrictHostKeyChecking=no" +SCP="scp -o 
StrictHostKeyChecking=no" + +ssh_pass() +{ + local node="$1" + local user="$2" + local pd="$3" + local port="$4" + shift 4 + local cmd="$*" + + run_cmd="$node $cmd" + [ "$user" != "" ] && run_cmd="${user}@$run_cmd" + [ "$port" != "" ] && run_cmd="-p $port $run_cmd" + run_cmd="$SSH $run_cmd" + [ "$pd" != "" ] && run_cmd="sshpass -p ${pd} $run_cmd" + # echo "run_cmd:$run_cmd" + $run_cmd || { echo "run sshrun failed node:$node"; return 1; } +} + +scp_pass() +{ + local node="$1" + local user="$2" + local pd="$3" + local port="$4" + local src="$5" + local target="$6" + + run_cmd="${node}:${target}" + [ "$user" != "" ] && run_cmd="${user}@$run_cmd" + run_cmd="-r $src/* ${run_cmd}" + [ "$port" != "" ] && run_cmd="-P $port $run_cmd" + run_cmd="${SCP} ${run_cmd}" + [ "$pd" != "" ] && run_cmd="sshpass -p ${pd} $run_cmd" + # echo "run_cmd:$run_cmd" + $run_cmd || { echo "run scp failed node:$node"; return 1; } +} + +rscp_pass() +{ + local node="$1" + local user="$2" + local pd="$3" + local port="$4" + local src="$5" + local target="$6" + + run_cmd="${node}:${src}/* ${target}" + [ "$user" != "" ] && run_cmd="${user}@$run_cmd" + [ "$port" != "" ] && run_cmd="-P $port $run_cmd" + run_cmd="${SCP} -r ${run_cmd}" + [ "$pd" != "" ] && run_cmd="sshpass -p ${pd} $run_cmd" + # echo "run_cmd:$run_cmd" + $run_cmd || { echo "run rscp failed node:$node"; return 1; } +} + +get_cluster_list() +{ + local cluster_config=$1 + cat ${cluster_config} | python3 -c 'import sys,json;[print(node) for node in json.load(sys.stdin)["cluster"].keys()]' +} + +get_node_user() +{ + local cluster_config=$1 + local node=$2 + cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["user"])' 2>/dev/null +} + +get_node_pd() +{ + local cluster_config=$1 + local node=$2 + cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["pd"])' 2>/dev/null +} + +get_node_port() +{ + local cluster_config=$1 + local node=$2 + cat 
${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["port"])' 2>/dev/null +} + +local_run_cmd() +{ + local cmd="$*" + (eval $cmd) || { echo "Warn local run '${cmd}'"; return 1; } +} + +local_scp_cmd() +{ + local src_path="$2" + local dst_path="$3" + + [ -d $src_path ] || { echo "Warn src_path:$src_path not exist return";return 1; } + + # rm and mkdir dst path + # [ -d $dst_path ] && rm -rf $dst_path + mkdir -p $dst_path + + cp -rf $src_path/* $dst_path 2>/dev/null + return 0 +} + +# nodeinfo.json + +# { +# "cluster": { +# "xx.xx.xx.xx": { +# "user": "xxxx", +# "pd": "xx", +# "port": xx, +# }, +# "xx.xx.xx.xx": { +# } +# } +# } + +# interface function + +# 根据参数1串行调用命令 如果失败就返回 +# 参数1: 节点信息json文件,包含节点ip和用户名密码信息 如果为空即是本地调用 +# 参数其他: 运行的命令 +# 样例 cluster_run_cmd_serial "$NODEINFO_FILE" "ifconfig" +cluster_run_cmd_serial() +{ + local node_info_file=$1 + shift 1 + + # clusterconfig file not set as local mode + [ "$node_info_file" == "" ] && { local_run_cmd "$@";return $?; } + + [ -f $node_info_file ] || { echo "$node_info_file not exist ret";return 1; } + local cmd=$* + local node_arr=($(get_cluster_list ${node_info_file})) + local node_count=${#node_arr[@]} + + for ((i=0; i<$node_count; i++)); do { + local node="${node_arr[$i]}" + local user=$(get_node_user ${node_info_file} ${node}) + local pd=$(get_node_pd ${node_info_file} ${node}) + local port=$(get_node_port ${node_info_file} ${node}) + local cur_cmd="export SERVER_ID=${i}; ${cmd}" + ssh_pass "${node}" "${user}" "${pd}" "$port" "${cur_cmd}" || { echo "node:${node} ERROR when executing '${cur_cmd}'"; return 1; } + } + done + return 0 +} + +# 根据参数1项调用命令,只调用一个节点的命令 +# 参数1: 节点信息json文件,包含节点ip和用户名密码信息 如果为空即是本地调用 +# 参数其他: 运行命令 +# 样例 cluster_run_cmd_single "$NODEINFO_FILE" "ifconfig" +cluster_run_cmd_single() +{ + local node_info_file=$1 + shift 1 + + # clusterconfig file not set as local mode + [ "$node_info_file" == "" ] && { local_run_cmd "$@";return $?; } + + [ -f 
$node_info_file ] || { echo "$node_info_file not exist ret";return 1; } + local cmd=$* + local node_arr=($(get_cluster_list ${node_info_file})) + + local node="${node_arr[0]}" + local user=$(get_node_user ${node_info_file} ${node}) + local pd=$(get_node_pd ${node_info_file} ${node}) + local port=$(get_node_port ${node_info_file} ${node}) + local cur_cmd="export SERVER_ID=0; ${cmd}" + ssh_pass "${node}" "${user}" "${pd}" "$port" "${cur_cmd}" || { echo "node:${node} ERROR when executing '${cur_cmd}'"; return 1; } + return 0 +} + +# 根据参数1调用命令,并行运行,等待所有命令执行完 +# 参数1: 节点信息json文件,包含节点ip和用户名密码信息 如果为空即是本地调用 +# 参数其他: 运行命令 +# 样例 cluster_run_cmd_parallel "$NODEINFO_FILE" "ifconfig" +cluster_run_cmd_parallel() +{ + local node_info_file=$1 + shift 1 + # clusterconfig file not set as local mode + [ "$node_info_file" == "" ] && { local_run_cmd "$@";return $?; } + + [ -f $node_info_file ] || { echo "$node_info_file not exist ret";return 1; } + local cmd=$* + local node_arr=($(get_cluster_list ${node_info_file})) + local node_count=${#node_arr[@]} + + local retvalfile=$(mktemp) + for ((i=0; i<$node_count; i++)); do { + local node="${node_arr[$i]}" + local user=$(get_node_user ${node_info_file} ${node}) + local pd=$(get_node_pd ${node_info_file} ${node}) + local port=$(get_node_port ${node_info_file} ${node}) + local cur_cmd="export SERVER_ID=${i}; $cmd" + ssh_pass "${node}" "${user}" "${pd}" "$port" ${cur_cmd} || { echo "node:${node} ERROR when executing '${cur_cmd}'"; rm -rf $retvalfile;} + } & + done + wait + [ -f $retvalfile ] || { echo "run train failed";return 1; } + rm -rf $retvalfile +} + +# 根据参数1拷贝主节点文件夹内容到各运行节点中 +# 注意 实际拷贝命令为 cp src_path/* dst_path 且dst_path会执行删除然后重建 +# 参数1: 节点信息json文件,包含节点ip和用户名密码信息 如果为空即是本地调用 +# 参数2: 主节点的源路径 src_path +# 参数3: 运行节点的源路径 dst_path +# 样例 cluster_scp "${NODEINFO_FILE}" "/home/src" "/home/dst" +cluster_scp() +{ + local node_info_file=$1 + local src_path=$2 + local dst_path=$3 + + # clusterconfig file not set as local mode + [ "$node_info_file" == 
"" ] && { local_scp_cmd "$@";return $?; } + + local node_arr=($(get_cluster_list ${node_info_file})) + local node_count=${#node_arr[@]} + + for ((i=0; i<$node_count; i++)); do { + local node="${node_arr[$i]}" + local user=$(get_node_user ${node_info_file} ${node}) + local pd=$(get_node_pd ${node_info_file} ${node}) + local port=$(get_node_port ${node_info_file} ${node}) + scp_pass "${node}" "${user}" "${pd}" "$port" "${src_path}" "${dst_path}" || { echo "scp_pass failed node:$node"; return 1; } + echo "------------scp done------${user}@${node}---------------------" + } done +} + +# 根据参数1拷贝各运行节点文件内容到主节点文件夹中 +# 注意 实际拷贝命令为 cp src_path/* dst_path 且dst_path会创建 +# 参数1: 节点信息json文件,包含节点ip和用户名密码信息 如果为空即是本地调用 +# 参数2: 运行节点的源路径 src_path +# 参数3: 主节点的源路径 dst_path +# 样例 cluster_rscp "${NODEINFO_FILE}" "/home/src" "/home/dst" +cluster_rscp() +{ + local node_info_file=$1 + local src_path="$2" + local dst_path="$3" + + # clusterconfig file not set as local mode + [ "$node_info_file" == "" ] && { local_scp_cmd "$@";return $?; } + + [ -f $node_info_file ] || { echo "$node_info_file not exist ret";return 1; } + + local node_arr=($(get_cluster_list ${node_info_file})) + local node_count=${#node_arr[@]} + + for ((i=0; i<$node_count; i++)); do { + local node="${node_arr[$i]}" + local user=$(get_node_user ${node_info_file} ${node}) + local pd=$(get_node_pd ${node_info_file} ${node}) + local port=$(get_node_port ${node_info_file} ${node}) + echo "------------------${user}@${node}---------------------" + rscp_pass "${node}" "${user}" "${pd}" "$port" "${src_path}" "${dst_path}" || { echo "sshpass_rscp failed node:$node"; return 1; } + } done +} + diff --git a/huawei/ais-bench_workload/src/common/cluster_common_2.0.sh b/huawei/ais-bench_workload/src/common/cluster_common_2.0.sh new file mode 100644 index 0000000000000000000000000000000000000000..71f2ff20b9e4c81ade8e95da8cd080c0426f77a4 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/cluster_common_2.0.sh @@ -0,0 +1,128 @@ +#!bin/bash 
+ +declare -i ret_ok=0 +declare -i ret_failed=1 + +local_run_cmd() +{ + local cmd="$1" + (eval "$cmd") || { echo "Warn local run '${cmd}'"; return $ret_failed; } +} + +local_cp_cmd() +{ + local src_path="$1" + local dst_path="$2" + if [ -f $src_path ]; then + cp -f $src_path $dst_path || { echo "cp: $src_path to $dst_path failed!";return $ret_failed; } + elif [ -d $src_path ]; then + cp -rf $src_path $dst_path || { echo "cp: $src_path to $dst_path failed!";return $ret_failed; } + else + echo "Warn src_path:$src_path not exist return" + return $ret_failed + fi + return $ret_ok +} + +local_put_cmd() +{ + local src_path="$1" + local dst_path=$WORK_PATH/"$2" + local_cp_cmd $src_path $dst_path || { return $ret_failed; } + return $ret_ok +} + +local_get_cmd() +{ + local src_path=$WORK_PATH/"$1" + local dst_path="$2" + local_cp_cmd $src_path $dst_path || { return $ret_failed; } + return $ret_ok +} + +cluster_init() +{ + [ "$NODEINFO_FILE" == "" ] && { echo "NODEINFO_FILE not set, will not use cluster";return $ret_ok; } + if [ -n "$CLUSTER_SSH_KEY_PATH" ];then + $PYTHON_COMMAND -m ais_bench.cluster init -n $NODEINFO_FILE -s $CLUSTER_SSH_KEY_PATH || { return $ret_failed; } + elif [ $CLUSTER_AUTO_SET_KEY == 'on' ];then + $PYTHON_COMMAND -m ais_bench.cluster init -n $NODEINFO_FILE -a || { return $ret_failed; } + else + return $ret_failed + fi + return $ret_ok +} + +cluster_multi_exec() +{ + [ "$NODEINFO_FILE" == "" ] && { local_run_cmd "$@";return $?; } + local cmd="$1" + local mode="$2" + local device_num="$3" + _cmd="$PYTHON_COMMAND -m ais_bench.cluster multi_exec -c '$cmd' " + [ "$mode" != "" ] && _cmd="$_cmd -m $mode " + [ "$device_num" != "" ] && _cmd="$_cmd -d $device_num " + (eval "$_cmd") || { return $ret_failed; } + return $ret_ok +} + +cluster_single_exec() +{ + [ "$NODEINFO_FILE" == "" ] && { local_run_cmd "$@";return $?; } + local cmd="$1" + local node_id="$2" + local device_num="$3" + _cmd="$PYTHON_COMMAND -m ais_bench.cluster single_exec -c '$cmd' " + [ 
"$node_id" != "" ] && _cmd="$_cmd -m $node_id " + [ "$device_num" != "" ] && _cmd="$_cmd -d $device_num " + (eval "$_cmd") || { return $ret_failed; } + return $ret_ok +} + +cluster_multi_put() +{ + [ "$NODEINFO_FILE" == "" ] && { local_put_cmd "$@";return $?; } + local src="$1" + local dst="$2" + local mode="$3" + _cmd="$PYTHON_COMMAND -m ais_bench.cluster multi_put -s $src -d $dst " + [ "$mode" != "" ] && _cmd="$_cmd -m $mode " + (eval "$_cmd") || { return $ret_failed; } + return $ret_ok +} + +cluster_single_put() +{ + [ "$NODEINFO_FILE" == "" ] && { local_put_cmd "$@";return $?; } + local src="$1" + local dst="$2" + local mode="$3" + _cmd="$PYTHON_COMMAND -m ais_bench.cluster single_put -s $src -d $dst " + [ "$node_id" != "" ] && _cmd="$_cmd -m $node_id " + (eval "$_cmd") || { return $ret_failed; } + return $ret_ok +} + +cluster_multi_get() +{ + [ "$NODEINFO_FILE" == "" ] && { local_get_cmd "$@";return $?; } + local src="$1" + local dst="$2" + local mode="$3" + _cmd="$PYTHON_COMMAND -m ais_bench.cluster multi_get -s $src -d $dst " + [ "$mode" != "" ] && _cmd="$_cmd -m $mode " + (eval "$_cmd") || { return $ret_failed; } + return $ret_ok +} + +cluster_single_get() +{ + [ "$NODEINFO_FILE" == "" ] && { local_get_cmd "$@";return $?; } + local src="$1" + local dst="$2" + local mode="$3" + _cmd="$PYTHON_COMMAND -m ais_bench.cluster single_get -s $src -d $dst " + [ "$node_id" != "" ] && _cmd="$_cmd -m $node_id " + (eval "$_cmd") || { return $ret_failed; } + return $ret_ok +} \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/common/common.sh b/huawei/ais-bench_workload/src/common/common.sh new file mode 100644 index 0000000000000000000000000000000000000000..4676c73c03b5415d26cf4a65a8f37cbfd8ca8436 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/common.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +check_file_valid() +{ + if [ ! -f "$1" ]; then + return 1 + fi + return 0 +} + +check_path_valid() +{ + if [ ! 
-d "$1" ]; then + return 1 + fi + return 0 +} + +function check_command_exist() +{ + command=$1 + if type ${command} > /dev/null 2>&1; then + return 0 + else + return 1 + fi +} + +check_python_package_is_install() +{ + local PYTHON_COMMAND=$1 + ${PYTHON_COMMAND} -c "import $2" >> /dev/null 2>&1 + ret=$? + if [ $ret != 0 ]; then + echo "python package:$1 not install" + return 1 + fi + return 0 +} + +check_mindspore_run_ok() +{ + local PYTHON_COMMAND=$1 + ${PYTHON_COMMAND} -c "import mindspore;mindspore.run_check()" >> /dev/null 2>&1 + ret=$? + if [ $ret != 0 ]; then + echo "mindspore run not ok" + return 1 + fi +} + +check_mindspore_run_ok_Ascend() +{ + local PYTHON_COMMAND=$1 + ${PYTHON_COMMAND} -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" >> /dev/null 2>&1 + ret=$? + if [ $ret != 0 ]; then + echo "mindspore run not ok" + return 1 + fi +} \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/common/log_util.sh b/huawei/ais-bench_workload/src/common/log_util.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f58b68e509539c32f4a13f12c576b42c8778f04 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/log_util.sh @@ -0,0 +1,60 @@ +#!/bin/bash +MODE_NAME="Ais-Bench-Stubs" + +date_format="+%Y-%m-%dT%T" + +# Some useful colors. +# check if stdout is a terminal and support colors... 
+if [ -t 1 ] && [ "1$(tput colors 2>/dev/null)" -ge 18 ]; then + readonly color_red="$(tput setaf 1)" + readonly color_yellow="$(tput setaf 3)" + readonly color_green="$(tput setaf 2)" + readonly color_norm="$(tput sgr0)" +else + readonly color_red="" + readonly color_yellow="" + readonly color_green="" + readonly color_norm="" +fi + +if command -v caller >/dev/null 2>&1; then + # return func(lineno:filename) + # NOTE: skip 2-level inner frame + _caller() { caller 2| awk '{sub(/.*\//,e,$3);print $2"("$3":"$1") "}'; } +else + _caller() { :; } +fi + +_log() +{ + level=$1 + shift 1 + echo "$(date ${date_format}) -${MODE_NAME}- ${level} $(_caller)- $*" +} + + +logger_Debug() +{ + echo "Debug $(_caller): $@" +} + +logger_Info() +{ + _log INFO "$@" +} + +logger_Warn() +{ + _log WARN "${color_yellow}$*${color_norm}" +} + +logger_Error() +{ + _log ERROR "${color_red}$*${color_norm}" +} + +die() +{ + _log ERROR "${color_red}$*${color_norm}" + exit 1 +} \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/common/modelarts_handler.py b/huawei/ais-bench_workload/src/common/modelarts_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..e2be624a9c37d5a06940884abcb74dd7ab312cf5 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/modelarts_handler.py @@ -0,0 +1,269 @@ +from re import S +from urllib.parse import urlparse +from obs import ObsClient, model +from modelarts.session import Session +from modelarts.estimator import JOB_STATE, Estimator +import time +import os + +import logging +logging.basicConfig(level = logging.DEBUG,format = '[%(levelname)s] %(message)s') +logger = logging.getLogger(__name__) + + +def get_config_value(config, key): + return None if config.get(key) == "" else config.get(key) + +def continue_waiting(job_info): + print("waiting for task, status %s, total time: %d(s)" % (JOB_STATE[job_info['status']], job_info['duration'] / 1000)) + +def exit_by_failure(job_info): + print("task failed, status %s, please 
check log on obs, exit" % (JOB_STATE[job_info['status']])) + raise RuntimeError('failed') + +func_table = { + 0: continue_waiting, + 1: continue_waiting, + 2: continue_waiting, + 3: exit_by_failure, + 4: continue_waiting, + 5: exit_by_failure, + 6: exit_by_failure, + 7: continue_waiting, + 8: continue_waiting, + 9: exit_by_failure, + 11: exit_by_failure, + 12: exit_by_failure, + 13: exit_by_failure, + 14: exit_by_failure, + 15: continue_waiting, + 16: exit_by_failure, + 17: exit_by_failure, + 18: continue_waiting, + 19: continue_waiting, + 20: continue_waiting, + 21: exit_by_failure, + 22: exit_by_failure +} + +# 调试需要 超时后停止 +def wait_for_job_timeout(job_instance): + count = 0 + while True: + time.sleep(10) + job_info = job_instance.get_job_info() + if job_info['status'] == 10: + print("task succeeded, total time %d(s)" % (job_info['duration'] / 1000)) + break + func_table[job_info['status']](job_info) + count = count + 1 + print("modelarts run time count:{}".format(count)) + if count == 6: + print("modelarts run match:{} 10 so exit >>>>>>>".format(count)) + status = job_instance.stop_job_version() + #status = job_instance.delete_job() + raise RuntimeError('failed') + break + +try: + import moxing as mox + moxing_import_flag = True +except: + moxing_import_flag = False + +class modelarts_handler(): + def __init__(self): + self.output_url = None + self.job_log_prefix = None + + def sync_job_log(self, session_config): + dstpath = os.path.join(os.getenv("BASE_PATH", "./"), "log") + if not os.path.exists(dstpath): + print("dstpath:{} not exist no get log") + return + for id in range(session_config.train_instance_count): + logurl = self.job_log_prefix + '-' + str(id) + '.log' + logname = os.path.basename(logurl) + logpath = os.path.join(dstpath, logname) + if self.session.obs.is_obs_path_exists(logurl): + self.session.obs.download_file(logurl, logpath) + #print("logurl:{} sync log to dstpath:{}".format(logurl, logpath)) + + def wait_for_job(self, job_instance, 
session_config): + count = 0 + while True: + time.sleep(10) + count = count + 1 + if count > 10: + count = 10 + self.sync_job_log(session_config) + job_info = job_instance.get_job_info() + if job_info['status'] == 10: + self.sync_job_log(session_config) + print("task succeeded, total time %d(s)" % (job_info['duration'] / 1000)) + break + func_table[job_info['status']](job_info) + + def create_obs_output_dirs(self, output_url): + if moxing_import_flag == True: + dstpath = output_url.replace("s3:", "obs:", 1) + logger.info("create obs outdir mox mkdir:{}".format(dstpath)) + mox.file.make_dirs(dstpath) + else: + bucket_name = output_url[5:].split('/')[0] + sub_dir = output_url.replace(f"s3://{bucket_name}/", "", 1) + logger.debug('create obs output{} subdir:{} bucket:{}'.format(output_url, sub_dir, bucket_name)) + resp = self.obsClient.putContent(bucket_name, sub_dir, content=None) + if resp.status < 300: + logger.debug('obs put content request ok') + else: + logger.warn('errorCode:{} msg:{}'.format(resp.errorCode, resp.errorMessage)) + raise RuntimeError('failed') + + def create_obs_handler(self, access_config): + if moxing_import_flag == False: + # 创建 obs登录句柄 + self.obsClient = ObsClient(access_key_id=access_config.access_key, + secret_access_key=access_config.secret_access_key, server=access_config.server) + + def create_session(self, access_config): + # 如下配置针对计算中心等专有云 通用云不需要设置 + if access_config.get("iam_endpoint") != "" and access_config.get("iam_endpoint") != None \ + and access_config.get("obs_endpoint") != "" and access_config.get("obs_endpoint") != None \ + and access_config.get("modelarts_endpoint") != "" and access_config.get("modelarts_endpoint") != None: + Session.set_endpoint(iam_endpoint=access_config.iam_endpoint, obs_endpoint=access_config.obs_endpoint, \ + modelarts_endpoint=access_config.modelarts_endpoint, region_name=access_config.region_name) + # 创建modelarts句柄 + self.session = Session(access_key=access_config.access_key, + 
secret_key=access_config.secret_access_key, + project_id=access_config.project_id, + region_name=access_config.region_name) + + def print_train_instance_types(self): + algo_info = Estimator.get_train_instance_types(modelarts_session=self.session) + print("get valid train_instance_types:{}".format(algo_info)) + + def stop_new_versions(self, session_config): + base_job_list_info = Estimator.get_job_list(modelarts_session=self.session, per_page=10, page=1, order="asc", search_content=session_config.job_name) + if base_job_list_info == None or base_job_list_info.get("job_total_count", 0) == 0: + print("find no match version return") + return + else: + pre_version_id = base_job_list_info["jobs"][0].get("version_id") + job_id = base_job_list_info["jobs"][0].get("job_id") + job_status = base_job_list_info["jobs"][0].get("status") + estimator = Estimator(modelarts_session=self.session, job_id=job_id, version_id=pre_version_id) + if JOB_STATE[job_status] == "JOBSTAT_INIT" \ + or JOB_STATE[job_status] == "JOBSTAT_IMAGE_CREATING" \ + or JOB_STATE[job_status] == "JOBSTAT_SUBMIT_TRYING" \ + or JOB_STATE[job_status] == "JOBSTAT_DEPLOYING" \ + or JOB_STATE[job_status] == "JOBSTAT_WAITING" \ + or JOB_STATE[job_status] == "JOBSTAT_RUNNING": + status = estimator.stop_job_version() + print("jobname:{} jobid:{} preversionid:{} jobstatus:{} stop status:{}".format( + session_config.job_name, job_id, pre_version_id, JOB_STATE[job_status], status)) + else: + print("jobname:{} jobid:{} preversionid:{} jobstatus:{} no need stop".format( + session_config.job_name, job_id, pre_version_id, JOB_STATE[job_status])) + return + + def get_job_name_next_new_version(self, session_config): + base_job_list_info = Estimator.get_job_list(modelarts_session=self.session, per_page=10, page=1, order="asc", search_content=session_config.job_name) + if base_job_list_info == None or base_job_list_info.get("job_total_count", 0) == 0: + return 1 + else: + pre_version_id = 
base_job_list_info["jobs"][0].get("version_id") + job_id = base_job_list_info["jobs"][0].get("job_id") + estimator = Estimator(modelarts_session=self.session, job_id=job_id, version_id=pre_version_id) + job_info = estimator.get_job_info() + pre_version_id = job_info.get("version_name", "V0")[1:] + return int(pre_version_id)+1 + + def get_obs_url_content(self, obs_url): + if moxing_import_flag == True: + dsturl = obs_url.replace("s3:", "obs:", 1) + with mox.file.File(dsturl, 'r') as f: + file_str = f.read() + return file_str + else: + bucket_name = obs_url[5:].split('/')[0] + obs_sub_path = obs_url.replace(f"s3://{bucket_name}/", "", 1) + resp = self.obsClient.getObject(bucket_name, obs_sub_path, loadStreamInMemory=True) + if resp.status < 300: + logger.debug('request ok') + return resp.body.buffer.decode("utf-8") + else: + raise RuntimeError('obs get object ret:{} url:{} bucket:{} path:{}'.format(resp.status, obs_url, bucket_name, obs_sub_path)) + + + def update_code_to_obs(self, session_config, localpath): + # 待完善 验证 + if moxing_import_flag == True: + dstpath = "obs:/" + session_config.code_dir + logger.info("mox update loaclpath:{} dstpath:{}".format(localpath, dstpath)) + mox.file.copy_parallel(localpath, dstpath) + else: + bucket_name = session_config.code_dir.split('/')[1] + sub_dir = "/".join(session_config.code_dir.strip("/").split('/')[1:]) + logger.info("update code localpath:{} codepath:{} bucket:{} subdir:{}".format( + localpath, session_config.code_dir, bucket_name, sub_dir)) + resp = self.obsClient.putFile(bucket_name, sub_dir, localpath) + + def create_modelarts_job(self, session_config, output_url): + jobdesc = session_config.job_description_prefix + "_jobname_" + session_config.job_name + "_" + str(session_config.train_instance_type) + "_" + str(session_config.train_instance_count) + estimator = Estimator(modelarts_session=self.session, + framework_type=session_config.framework_type, + framework_version=session_config.framework_version, + 
code_dir=session_config.code_dir, + boot_file=session_config.boot_file, + log_url=output_url[4:], + hyperparameters=session_config.hyperparameters, + output_path=output_url[4:], + pool_id = get_config_value(session_config, "pool_id"), + train_instance_type = get_config_value(session_config, "train_instance_type"), + train_instance_count=session_config.train_instance_count, + nas_type = get_config_value(session_config, "nas_type"), + nas_share_addr = get_config_value(session_config, "nas_share_addr"), + nas_mount_path = get_config_value(session_config, "nas_mount_path"), + job_description=jobdesc, + user_command = None) + + base_job_list_info = Estimator.get_job_list(modelarts_session=self.session, per_page=10, page=1, order="asc", search_content=session_config.job_name) + if base_job_list_info == None or base_job_list_info.get("job_total_count", 0) == 0: + logger.debug("new create inputs:{} job_name:{}".format(session_config.inputs, session_config.job_name)) + job_instance = estimator.fit(inputs=session_config.inputs, wait=False, job_name=session_config.job_name) + else: + job_id = base_job_list_info["jobs"][0].get("job_id") + pre_version_id = base_job_list_info["jobs"][0].get("version_id") + logger.debug("new versions job_id:{} pre_version_id:{}".format(job_id, pre_version_id)) + job_instance = estimator.create_job_version(job_id=job_id, pre_version_id=pre_version_id, inputs=session_config.inputs, wait=False, job_desc=jobdesc) + + print("inputs:{} job_name:{} ret instance:{}".format(session_config.inputs, session_config.job_name, job_instance)) + job_info = job_instance.get_job_info() + if not job_info['is_success']: + logger.error("failed to run job on modelarts, msg %s" % (job_info['error_msg'])) + raise RuntimeError('failed') + + self.job_log_prefix = "obs:/" + output_url[4:] + job_info["resource_id"] + "-job-" + session_config.job_name + + print("create sucess job_id:{} resource_id:{} version_name:{} create_time:{}".format( + job_info["job_id"], 
job_info["resource_id"], job_info["version_name"], job_info["create_time"])) + return job_instance + + def run_job(self, session_config, localpath): + logger.debug("session config:{}".format(session_config)) + + self.print_train_instance_types() + + # 获取job_name的next 版本号 + next_version_id = self.get_job_name_next_new_version(session_config) + # 生成输出路径 + self.output_url = os.path.join("s3:/{}".format(session_config.out_base_url), "V{}".format(next_version_id), "") + logger.debug("output_url:{}".format(self.output_url)) + self.create_obs_output_dirs(self.output_url) + + # 更新代码到obs上 + self.update_code_to_obs(session_config, localpath) + + job_instance = self.create_modelarts_job(session_config, self.output_url) + self.wait_for_job(job_instance, session_config) \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/common/modelarts_handler_v2.py b/huawei/ais-bench_workload/src/common/modelarts_handler_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..d37b605d9d33ff3c06f3d802091ac96a8291ea93 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/modelarts_handler_v2.py @@ -0,0 +1,201 @@ +import logging +import os +import time + +from modelarts.estimatorV2 import JOB_STATE, Estimator +from modelarts.session import Session +from modelarts.train_params import InputData, OutputData, TrainingFiles +from obs import ObsClient + +logging.basicConfig(level=logging.DEBUG, format='[%(levelname)s] %(message)s') +logger = logging.getLogger(__name__) + + +def get_config_value(config, key): + return None if config.get(key) == "" else config.get(key) + + +try: + import moxing as mox + moxing_import_flag = True +except Exception: + moxing_import_flag = False + + +class modelarts_handler(): + RESP_OK = 300 + OBS_PATH_HEAD = "obs:/" + + def __init__(self): + self.output_url = None + self.job_log_prefix = None + self.job_name = None + self.job_instance = None + self.session_config = None + self.bucket_name = None + + def sync_job_log(self, 
session_config): + dstpath = os.path.join(os.getenv("BASE_PATH", "./"), "log") + if not os.path.exists(dstpath): + os.makedirs(dstpath) + for id in range(session_config.train_instance_count): + logurl = self.job_log_prefix + '-' + str(id) + '.log' + logname = os.path.basename(logurl) + logpath = os.path.join(dstpath, logname) + if self.session.obs.is_obs_path_exists(logurl): + self.session.obs.download_file(logurl, logpath) + + def wait_for_job(self): + count = 0 + while True: + time.sleep(10) + count = count + 1 + if count % 10 == 0: + self.sync_job_log(self.session_config) + job_info = self.job_instance.get_job_info() + + phase = job_info['status']['phase'] + if phase == "Completed": + self.sync_job_log(self.session_config) + logger.info("task succeeded, total time %d(s)" % (job_info['status']['duration'] / 1000)) + break + elif phase in ['Failed', 'Abnormal', 'Terminated']: + print("task failed, phase %s, please check log on obs, exit" % (job_info['status']['phase'])) + raise RuntimeError('job failed') + else: + print("waiting for task, phase %s, total time: %d(s), actual training time: %d(s) " + % (job_info['status']['phase'], 10 * count, job_info['status']['duration'] / 1000)) + + def create_obs_output_dirs(self, output_url): + if moxing_import_flag: + dstpath = self.OBS_PATH_HEAD + output_url + logger.info("create obs outdir mox mkdir:{}".format(dstpath)) + mox.file.make_dirs(dstpath) + else: + sub_dir = output_url.replace(f"/{self.bucket_name}/", "", 1) + logger.debug('create obs output{} subdir:{} bucket:{}'.format(output_url, sub_dir, self.bucket_name)) + resp = self.obsClient.putContent(self.bucket_name, sub_dir, content=None) + if resp.status < self.RESP_OK: + logger.debug('obs put content request ok') + else: + logger.warn('create obs folder failed. 
errorCode:{} msg:{}'.format(resp.errorCode, resp.errorMessage)) + raise RuntimeError('create obs folder failed') + + def create_obs_handler(self, access_config): + if not moxing_import_flag: + # Create OBS login handle + self.obsClient = ObsClient(access_key_id=access_config.access_key, + secret_access_key=access_config.secret_access_key, server=access_config.server) + + def create_session(self, access_config): + # 如下配置针对计算中心等专有云 通用云不需要设置 + if access_config.get("iam_endpoint") != "" and access_config.get("iam_endpoint") is not None \ + and access_config.get("obs_endpoint") != "" and access_config.get("obs_endpoint") is not None \ + and access_config.get("modelarts_endpoint") != "" and access_config.get("modelarts_endpoint") is not None: + Session.set_endpoint(iam_endpoint=access_config.iam_endpoint, obs_endpoint=access_config.obs_endpoint, + modelarts_endpoint=access_config.modelarts_endpoint, + region_name=access_config.region_name) + # Create modelars handle + self.session = Session(access_key=access_config.access_key, + secret_key=access_config.secret_access_key, + project_id=access_config.project_id, + region_name=access_config.region_name) + + def print_train_instance_types(self): + algo_info = Estimator.get_train_instance_types(self.session) + print("get valid train_instance_types:{}".format(algo_info)) + + def stop_job(self, job_id): + job_info = Estimator.control_job_by_id(session=self.session, job_id=job_id) + print("job stop status: {}".format(job_info["status"]["phase"])) + + def get_obs_url_content(self, obs_url): + if moxing_import_flag: + dsturl = self.OBS_PATH_HEAD + obs_url + with mox.file.File(dsturl, 'r') as f: + file_str = f.read() + return file_str + else: + obs_sub_path = obs_url.replace(f"/{self.bucket_name}/", "", 1) + resp = self.obsClient.getObject(self.bucket_name, obs_sub_path, loadStreamInMemory=True) + if resp.status < self.RESP_OK: + logger.debug('request ok') + return resp.body.buffer.decode("utf-8") + else: + raise RuntimeError('obs 
get object ret:{} url:{} bucket:{} \ + path:{}'.format(resp.status, obs_url, self.bucket_name, obs_sub_path)) + + def update_code_to_obs(self, localpath): + if moxing_import_flag: + dstpath = self.OBS_PATH_HEAD + self.session_config.code_dir + logger.info("mox update loaclpath:{} dstpath:{}".format(localpath, dstpath)) + mox.file.copy_parallel(localpath, dstpath) + else: + sub_dir = "/".join(self.session_config.code_dir.strip("/").split('/')[1:]) + logger.info("update code localpath:{} codepath:{} bucket:{} subdir:{}".format( + localpath, self.session_config.code_dir, self.bucket_name, sub_dir)) + print("bucket_name:{} sub_dir: {} localpath:{}".format(self.bucket_name, sub_dir, localpath)) + self.obsClient.putFile(self.bucket_name, sub_dir, localpath) + + def create_modelarts_job(self, output_url): + jobdesc = self.session_config.job_description_prefix + "_jobname_" + self.job_name + "_" +\ + str(self.session_config.train_instance_type) + "_" + str(self.session_config.train_instance_count) + + output_list = [OutputData(obs_path=self.OBS_PATH_HEAD + self.session_config.out_base_url + self.job_name + "/", + name="train_url")] + + estimator = Estimator(session=self.session, + framework_type=self.session_config.framework_type, + framework_version=self.session_config.framework_version, + training_files=TrainingFiles(code_dir=self.OBS_PATH_HEAD + self.session_config.code_dir, + boot_file=self.OBS_PATH_HEAD + self.session_config.boot_file), + log_url=self.OBS_PATH_HEAD + output_url, + parameters=self.session_config.parameters, + outputs=output_list, + pool_id=get_config_value(self.session_config, "pool_id"), + train_instance_type=get_config_value(self.session_config, "train_instance_type"), + train_instance_count=self.session_config.train_instance_count, + job_description=jobdesc, + user_command=None) + + logger.debug("new create inputs:{} job_name:{}".format(self.session_config.inputs, self.job_name)) + inut_list = [InputData(obs_path=self.OBS_PATH_HEAD + 
self.session_config.inputs, name="data_url")] + try: + job_instance = estimator.fit(inputs=inut_list, wait=False, job_name=self.job_name) + except Exception as e: + logger.error("failed to create job on modelarts, msg %s" % (e)) + raise RuntimeError('create job failed') + + logger.debug("inputs:{} job_name:{} ret instance:{}".format(inut_list, self.job_name, job_instance)) + job_info = job_instance.get_job_info() + print("\njob_info: {}\n".format(job_info)) + + if 'error_msg' in job_info.keys(): + logger.error("failed to run job on modelarts, error_msg: %s error_code:\ + %s error_solution: %s" % (job_info['error_msg'], job_info['error_code'], job_info['error_solution'])) + raise RuntimeError('create job failed') + + self.job_log_prefix = self.OBS_PATH_HEAD + output_url + "modelarts-job-" + job_info['metadata']['id'] + '-worker' + print("create job success. job_id:{} job name:{} create_time:{} job_log_prefix:{}".format( + job_info["metadata"]["id"], job_info["metadata"]["name"], job_info["metadata"]["create_time"], + self.job_log_prefix)) + + return job_instance + + def run_job(self, session_config, localpath): + logger.debug("session config:{}".format(self.session_config)) + timestr = time.strftime("%Y_%m_%d-%H_%M_%S") + self.session_config = session_config + self.job_name = self.session_config.job_name + timestr + self.print_train_instance_types() + # modelarts path end with '/',or report error ModelArts.2791 + self.output_url = os.path.join(self.session_config.out_base_url, self.job_name, "") + self.bucket_name = self.session_config.out_base_url.split('/')[1] + logger.debug("output_url:{}".format(self.output_url)) + self.create_obs_output_dirs(self.output_url) + + # update code to obs + self.update_code_to_obs(localpath) + + self.job_instance = self.create_modelarts_job(self.output_url) + self.wait_for_job() diff --git a/huawei/ais-bench_workload/src/common/node_common.sh b/huawei/ais-bench_workload/src/common/node_common.sh new file mode 100644 index 
0000000000000000000000000000000000000000..d00be79d8c8e2059ce7bd5ebceb265f32349ae30 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/node_common.sh @@ -0,0 +1,113 @@ + +get_node_podname() +{ + local rank_table_file=$1 + local server_id=$2 + cat ${rank_table_file} | python3 -c 'import sys,json;print(json.load(sys.stdin)["group_list"][0]["instance_list"]['${server_id}']["pod_name"])' #2>/dev/null +} + +# 通用检测 在主节点上检测环境是否正常 +check_env_common() +{ + : "${RANK_SIZE?RANK_SIZE not set}" + : "${DEVICE_NUM?DEVICE_NUM not set}" + + # check ranktable set + [[ $RANK_SIZE -eq 1 ]] || : "${RANK_TABLE_FILE?RANK_TABLE_FILE not set}" + [[ $RANK_SIZE -eq 1 ]] && [[ -n "$RANK_TABLE_FILE" ]] && { echo "ranksize=1 should not set RANK_TABLE_FILE";return 1; } + + : "${PYTHON_COMMAND?PYTHON_COMMAND not set}" + + # check nodeinfofile exist + # [[ $RANK_SIZE -le 8 ]] || check_file_valid "${NODEINFO_FILE}" || { echo "nodeinfofile:${NODEINFO_FILE} not valid" ; return 1; } + + # check basic command of the main node + if [ -f "$NODEINFO_FILE" ];then + check_command_exist ssh || { echo "ssh running failed" ; return 1; } + echo "ssh running successfully" + check_command_exist sshpass || { echo "sshpass running failed" ; return 1; } + echo "sshpass running successfully" + fi + return 0 +} + +# 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE +node_common_check() +{ + local pythoncmd="$1" + local ranksize="$2" + local ranktable="$3" + check_command_exist ${pythoncmd} || { logger_Warn "python:$pythoncmd running failed" ; return 1; } + echo "${pythoncmd} running successfully" + + if [ ${ranksize} != 1 ]; then + check_file_valid "$ranktable" || { logger_Warn "RANK_TABLE_FILE:${ranktable} not valid path" ; return 1; } + echo "RANK_TABLE_FILE path valid" + fi + return 0 +} + +# 通用训练函数调用 +# 必须依赖变量包括 WORK_PATH get_train_cmd 会做检查 +# 参数1 是否绑核 如果需要 传入 "true" +# 参数2 是否老的ranktable 如果需要 传入 "true" + +function node_common_train() +{ + [ -d "$WORK_PATH" ] || { echo "not exist WORK_PATH return";return 1; } + [ 
"$(type -t get_train_cmd)" == 'function' ] || { echo "not exist get_train_cmd func return";return 1; } + + bindcore=$([ "$1" == "true" ] && echo "true" || echo "false") + oldranktable=$([ "$2" == "true" ] && echo "true" || echo "false") + + # get server node id default is 0 + : "${SERVER_ID:=0}" + # get rank start index + if [[ $DEVICE_NUM == 1 && $RANK_SIZE == 1 ]];then + : "${SINGLE_CARD_INDEX:=0}" + RANK_START=$SINGLE_CARD_INDEX + else + # get rank start index + RANK_START=`expr ${SERVER_ID} \* $DEVICE_NUM` + fi + # set bind core + [ $bindcore == "true" ] && { cpus=`cat /proc/cpuinfo| grep "processor"| wc -l`; avg=`expr $cpus \/ $DEVICE_NUM`; gap=`expr $avg \- 1`; } + if [ $oldranktable == "true" ];then + podname=$(get_node_podname ${RANK_TABLE_FILE} ${SERVER_ID}) + fi + + retvalfile=$(mktemp) + for((i=0;i<${DEVICE_NUM};i++));do + { + index=$[i+RANK_START] + export DEVICE_ID=${i} + # old ranktable should set DEVICE_INDEX and new ranktable should set RANK_ID + if [ $oldranktable == "true" ];then + export DEVICE_INDEX=$[i+RANK_START] + export RANK_ID=$podname + else + export RANK_ID=$[i+RANK_START] + # 应该是device_id吧 + export ASCEND_DEVICE_ID=$DEVICE_ID + export DEVICE_INDEX=$DEVICE_ID + export RANK_INDEX=$SERVER_ID + fi + # clear and create path. 
+ RUN_PATH="$WORK_PATH/train_parallel$index" + mkdir -p $RUN_PATH; cd $RUN_PATH; + # if bindcore should get cmdopt for cores + [ $bindcore == "true" ] && { start=`expr $i \* $avg`; end=`expr $start \+ $gap`; cmdopt=$start"-"$end; } + # call out func get run cmd + get_train_cmd + logger_Info "start training for SERVER_ID:$SERVER_ID rank $index, device $DEVICE_ID begin cmd:$train_run_cmd" + # if bindcore add taskset + [ $bindcore == "true" ] && train_run_cmd="taskset -c $cmdopt $train_run_cmd" + # call cmd + eval $train_run_cmd | tee -a $RUN_PATH/train.log 2>&1 || { logger_Warn "train failed rank $index, device $DEVICE_ID failed:$?"; rm -rf $retvalfile; } + } & + done + logger_Info "Waiting for the training process of SERVER_ID:${SERVER_ID} to finish" + wait + [ -f $retvalfile ] || { logger_Warn "run train failed";return 1; } + logger_Info "SERVER_ID:${SERVER_ID} training finished" +} diff --git a/huawei/ais-bench_workload/src/common/patch_common.sh b/huawei/ais-bench_workload/src/common/patch_common.sh new file mode 100644 index 0000000000000000000000000000000000000000..107df9ae56a1c472888b5832e47f07e81e75cbef --- /dev/null +++ b/huawei/ais-bench_workload/src/common/patch_common.sh @@ -0,0 +1,67 @@ + +get_modelzoo_base_code_by_git(){ + [ -z $git_url ] && { echo "args git_url not exist";return 1; } + [ -z $branch ] && { echo "args branch not exist";return 1; } + [ -z $modelzoo_sub_dir ] && { echo "args modelzoo_sub_dir not exist";return 1; } + [ -z $commitid ] && { echo "args commitid not exist";return 1; } + + git clone $git_url -b $branch || { echo "warn git clone failed"; return 1; } + code_dir=${modelzoo_sub_dir%%/*} + + cd ${code_dir} + git reset --hard $commitid || { echo "warn git reset failed"; return 1; } + cd - +} + +make_patch(){ + [ -z $BUILD_TMP_PATH ] && { echo "args BUILD_TMP_PATH not exist";return 1; } + [ -z $modelzoo_sub_dir ] && { echo "args modelzoo_sub_dir not exist";return 1; } + [ -z $target_dir ] && { echo "args target_dir not exist";return 1; 
} + [ -z $patch_file_name ] && { echo "args patch_file_name not exist";return 1; } + [ -z $CUR_PATH ] && { echo "args patch_file_name not exist";return 1; } + + cd $BUILD_TMP_PATH + get_modelzoo_base_code_by_git || { echo "warn git getcode failed"; return 1; } + cp $modelzoo_sub_dir -rf $BUILD_TMP_PATH/origin + cp $target_dir -rf $BUILD_TMP_PATH/code + + diff -Nur origin code > $BUILD_TMP_PATH/$patch_file_name.patch + + cp $BUILD_TMP_PATH/$patch_file_name.patch $CUR_PATH/ +} + +load_code(){ + [ -z $BUILD_TMP_PATH ] && { echo "args BUILD_TMP_PATH not exist";return 1; } + [ -z $modelzoo_sub_dir ] && { echo "args modelzoo_sub_dir not exist";return 1; } + [ -z $patch_file_name ] && { echo "args patch_file_name not exist";return 1; } + [ -z $target_patchcode_dir ] && { echo "args target_patchcode_dir not exist";return 1; } + [ -z $CUR_PATH ] && { echo "args CUR_PATH not exist";return 1; } + + cd $BUILD_TMP_PATH + get_modelzoo_base_code_by_git || { echo "warn git getcode failed"; return 1; } + cp $modelzoo_sub_dir -rf $BUILD_TMP_PATH/origin + cp $modelzoo_sub_dir -rf $BUILD_TMP_PATH/code + + if [ -f $CUR_PATH/$patch_file_name.patch ];then + patch -p0 < $CUR_PATH/$patch_file_name.patch || { echo "warn patch pfile failed"; return 1; } + else + echo "no patch file" + fi + [ ! 
-d $target_patchcode_dir ] || rm -rf $target_patchcode_dir + mkdir $target_patchcode_dir + cp $BUILD_TMP_PATH/code/* -rf $target_patchcode_dir/ +} + +mk_version_file() +{ + version_files=$1 + echo "git_url: $git_url" > $version_files + echo "branch: $branch" >> $version_files + echo "commitid: $commitid" >> $version_files + if [ -f $CUR_PATH/$patch_file_name.patch ];then + echo "patch_file_name: $patch_file_name.patch" >> $version_files + else + echo "patch_file_name: None" >> $version_files + fi + +} diff --git a/huawei/ais-bench_workload/src/common/sshpass_common.sh b/huawei/ais-bench_workload/src/common/sshpass_common.sh new file mode 100644 index 0000000000000000000000000000000000000000..3dfd4baa32c12a4927895f82f7b13a6147045c4e --- /dev/null +++ b/huawei/ais-bench_workload/src/common/sshpass_common.sh @@ -0,0 +1,212 @@ +#!/bin/bash + +SSH="ssh -o StrictHostKeyChecking=no" +SCP="scp -o StrictHostKeyChecking=no" + +ssh_pass() +{ + local node="$1" + local user="$2" + local pd="$3" + local port="$4" + shift 4 + local cmd="$*" + + run_cmd="$node $cmd" + [ "$user" != "" ] && run_cmd="${user}@$run_cmd" + [ "$port" != "" ] && run_cmd="-p $port $run_cmd" + run_cmd="$SSH $run_cmd" + [ "$pd" != "" ] && run_cmd="sshpass -p ${pd} $run_cmd" + echo "run_cmd:$run_cmd" + $run_cmd || { echo "run sshrun failed node:$node"; return 1; } +} + +scp_pass() +{ + local node="$1" + local user="$2" + local pd="$3" + local port="$4" + local src="$5" + local target="$6" + + run_cmd="${node}:${target}" + [ "$user" != "" ] && run_cmd="${user}@$run_cmd" + run_cmd="-r $src ${run_cmd}" + [ "$port" != "" ] && run_cmd="-P $port $run_cmd" + run_cmd="${SCP} ${run_cmd}" + [ "$pd" != "" ] && run_cmd="sshpass -p ${pd} $run_cmd" + echo "run_cmd:$run_cmd" + $run_cmd || { echo "run scp failed node:$node"; return 1; } +} + +rscp_pass() +{ + local node="$1" + local user="$2" + local pd="$3" + local port="$4" + local src="$5" + local target="$6" + + run_cmd="${node}:${src} ${target}" + [ "$user" != "" ] && 
run_cmd="${user}@$run_cmd" + [ "$port" != "" ] && run_cmd="-P $port $run_cmd" + run_cmd="${SCP} -r ${run_cmd}" + [ "$pd" != "" ] && run_cmd="sshpass -p ${pd} $run_cmd" + echo "run_cmd:$run_cmd" + $run_cmd || { echo "run rscp failed node:$node"; return 1; } +} + +get_cluster_list() +{ + local cluster_config=$1 + cat ${cluster_config} | python3 -c 'import sys,json;[print(node) for node in json.load(sys.stdin)["cluster"].keys()]' +} + +get_node_user() +{ + local cluster_config=$1 + local node=$2 + cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["user"])' 2>/dev/null +} + +get_node_pd() +{ + local cluster_config=$1 + local node=$2 + cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["pd"])' 2>/dev/null +} + +get_node_port() +{ + local cluster_config=$1 + local node=$2 + cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["port"])' 2>/dev/null +} + +local_run_cmd() +{ + local cmd="$*" + (eval $cmd) || { echo "Warn local run '${cmd}'"; return 1; } +} + +local_scp_cmd() +{ + local src_path="$2" + local dst_path="$3" + + [ -d $src_path ] || { echo "Warn src_path:$src_path not exist return";return 1; } + + # rm and mkdir dst path + [ -d $dst_path ] && rm -rf $dst_path + mkdir -p $dst_path + + cp -rf "$src_path" $dst_path || { echo "Warn local cp failed";return 1; } +} + +# interface function + +cluster_run_cmd_serial() +{ + local node_info_file=$1 + shift 1 + + # clusterconfig file not set as local mode + [ "$node_info_file" == "" ] && { local_run_cmd "$@";return $?; } + + [ -f $node_info_file ] || { echo "$node_info_file not exist ret";return 1; } + local cmd=$* + local node_arr=($(get_cluster_list ${node_info_file})) + local node_count=${#node_arr[@]} + + for ((i=0; i<$node_count; i++)); do { + local node="${node_arr[$i]}" + local user=$(get_node_user ${node_info_file} ${node}) + local pd=$(get_node_pd 
${node_info_file} ${node}) + local port=$(get_node_port ${node_info_file} ${node}) + local cur_cmd="export SERVER_ID=${i}; ${cmd}" + ssh_pass "${node}" "${user}" "${pd}" "$port" "${cur_cmd}" || { echo "ERROR when executing '${cur_cmd}'"; return 1; } + } + done + return 0 +} + +cluster_run_cmd_parallel() +{ + node_info_file=$1 + shift 1 + # clusterconfig file not set as local mode + [ "$node_info_file" == "" ] && { local_run_cmd "$@";return $?; } + + [ -f $node_info_file ] || { echo "$node_info_file not exist ret";return 1; } + cmd=$* + node_arr=($(get_cluster_list ${node_info_file})) + node_count=${#node_arr[@]} + + retvalfile=$(mktemp) + for ((i=0; i<$node_count; i++)); do { + node="${node_arr[$i]}" + user=$(get_node_user ${node_info_file} ${node}) + pd=$(get_node_pd ${node_info_file} ${node}) + port=$(get_node_port ${node_info_file} ${node}) + cur_cmd="export SERVER_ID=${i}; $cmd" + echo "正在登陆${user}@${node},SERVER_ID为${i}, 进入后执行的命令为${cur_cmd}" + ssh_pass "${node}" "${user}" "${pd}" "$port" ${cur_cmd} || { echo "run ssh failed node:$node"; rm -rf $retvalfile; } + }& + done + logger_Info "now wait run cmd done" + wait + [ -f $retvalfile ] || { echo "run train failed";return 1; } + rm -rf $retvalfile + logger_Info "now run cmd done finish" +} + +sshpass_scp_cmd() +{ + node_info_file=$1 + + # clusterconfig file not set as local mode + [ "$node_info_file" == "" ] && { local_scp_cmd "$@";return $?; } + + src_path=$2 + dst_path=$3 + node_arr=($(get_cluster_list ${node_info_file})) + node_count=${#node_arr[@]} + + for ((i=0; i<$node_count; i++)); do { + node="${node_arr[$i]}" + user=$(get_node_user ${node_info_file} ${node}) + pd=$(get_node_pd ${node_info_file} ${node}) + port=$(get_node_port ${node_info_file} ${node}) + echo "------------scp ------${user}@${node}---------------------" + ssh_pass "${node}" "${user}" "${pd}" "$port" "rm -rf ${dst_path};mkdir -p ${dst_path}" || { echo "sshpass_scp failed node:$node"; return 1; } + scp_pass "${node}" "${user}" "${pd}" 
"$port" "${src_path}" "${dst_path}" || { echo "sshpass_scp failed node:$node"; return 1; } + echo "------------scp done------${user}@${node}---------------------" + } done +} + +sshpass_rscp_cmd() +{ + node_info_file=$1 + + # clusterconfig file not set as local mode + [ "$node_info_file" == "" ] && { local_scp_cmd "$@";return $?; } + + [ -f $node_info_file ] || { echo "$node_info_file not exist ret";return 1; } + + src_path="$2" + dst_path="$3" + node_arr=($(get_cluster_list ${node_info_file})) + node_count=${#node_arr[@]} + + for ((i=0; i<$node_count; i++)); do { + node="${node_arr[$i]}" + user=$(get_node_user ${node_info_file} ${node}) + pd=$(get_node_pd ${node_info_file} ${node}) + port=$(get_node_port ${node_info_file} ${node}) + echo "------------------${user}@${node}---------------------" + # ssh_pass ${node} ${user} ${pd} "rm -rf ${dst_path}" + rscp_pass "${node}" "${user}" "${pd}" "$port" "${src_path}" "${dst_path}" || { echo "sshpass_rscp failed node:$node"; return 1; } + } done +} \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/common/train_modelarts.py b/huawei/ais-bench_workload/src/common/train_modelarts.py new file mode 100644 index 0000000000000000000000000000000000000000..dc2c66f05fc85ce80735d55b7eb6ef848bb9eee9 --- /dev/null +++ b/huawei/ais-bench_workload/src/common/train_modelarts.py @@ -0,0 +1,106 @@ +import argparse +import logging +import os +import sys +from statistics import mean + +sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..')) + +import ais_utils +from config.modelarts_config import access_config +from config.modelarts_config import session_config as session_config_v1 +from config.modelarts_config import session_config_v2 +from modelarts_handler_v2 import modelarts_handler as modelarts_handler_v2 + +from modelarts_handler import logger, modelarts_handler + + +def report_result(handler): + ranksize_file_url = os.path.join(handler.output_url, 'ranksize.json') + ranksize = 
int(handler.get_obs_url_content(ranksize_file_url)) + print("url:{} read ranksize:{}".format(ranksize_file_url, ranksize)) + + total_throughput = 0.0 + for rankid in range(0, ranksize): + throughput_url = os.path.join(handler.output_url, 'throughput_' + str(rankid) + '.json') + single_throughput_rate = float(handler.get_obs_url_content(throughput_url)) + print("rankid:{} url:{} read throughput:{}".format(rankid, throughput_url, single_throughput_rate)) + total_throughput = total_throughput + single_throughput_rate + print("report result total_throughput : {}".format(total_throughput)) + ais_utils.set_result("training", "throughput_ratio", total_throughput) + + accuracy_file_url = os.path.join(handler.output_url, 'accuracy.json') + accuracy = float(handler.get_obs_url_content(accuracy_file_url)) + print("url:{} read accuracy:{}".format(accuracy_file_url, accuracy)) + + print("report result accuracy:{}".format(accuracy)) + ais_utils.set_result("training", "accuracy", accuracy) + +# 单设备运行模式 +def report_result_singlesever_mode(handler, server_count): + # 单设备运行模式下默认都是8卡 + cards_per_server = 8 + print("server_count:{} cards_per_server:{}".format(server_count, cards_per_server)) + + throughput_list = [] + accuracy_list = [] + for server_id in range(server_count): + single_server_throughput = 0.0 + for rankid in range(cards_per_server): + throughput_url = os.path.join(handler.output_url, str(server_id), 'throughput_' + str(rankid) + '.json') + single_card_throughput = float(handler.get_obs_url_content(throughput_url)) + print("rankid:{} url:{} read throughput:{}".format(rankid, throughput_url, single_card_throughput)) + single_server_throughput = single_server_throughput + single_card_throughput + print("serverid:{} count:{} service_throughput:{}".format(server_id, server_count, single_server_throughput)) + throughput_list.append(single_server_throughput) + + accuracy_file_url = os.path.join(handler.output_url, 'accuracy_{}.json'.format(server_id)) + single_server_accuracy 
= float(handler.get_obs_url_content(accuracy_file_url)) + print("serverid:{} url:{} read accuracy:{}".format(server_id, accuracy_file_url, single_server_accuracy)) + accuracy_list.append(single_server_accuracy) + + print("report >> throughput_list:{} average:{}".format(throughput_list, mean(throughput_list))) + print("report >> accuracy_list:{} average:{}".format(accuracy_list, mean(accuracy_list))) + + ais_utils.set_result("training", "throughput_ratio", mean(throughput_list)) + ais_utils.set_result("training", "accuracy", mean(accuracy_list)) + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_code_path", help="the local path of run code") + parser.add_argument("--single_server_mode", action="store_true", help="the local path of run code") + parser.add_argument("--action", default="run", choices=["run", "stop"], help="action (run or stop)") + parser.add_argument("--modelarts_version", default="V1", choices=["V1", "V2"], help="modelarts version (V1 or V2)") + parser.add_argument("--job_id", default="None", help="job id used to stop given job") + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = get_args() + + logger.setLevel(logging.DEBUG) + session_config = session_config_v1 if args.modelarts_version == 'V1' else session_config_v2 + + handler = modelarts_handler() if args.modelarts_version == 'V1' else modelarts_handler_v2() + handler.create_session(access_config) + + if args.action == "stop": + if args.modelarts_version == 'V1': + handler.stop_new_versions(session_config) + else: + handler.stop_job(args.job_id) + sys.exit() + + handler.create_obs_handler(access_config) + + # default run mode + handler.run_job(session_config, args.local_code_path) + + # handler.output_url = "s3://0923/00lcm/result_dump/res/V212/" + try: + if args.single_server_mode: + report_result_singlesever_mode(handler, session_config.train_instance_count) + else: + report_result(handler) + except FileNotFoundError as e: + 
print("error report result failed. Exception:", e) diff --git a/huawei/ais-bench_workload/src/train/huawei/common/mindspore_env.sh b/huawei/ais-bench_workload/src/train/huawei/common/mindspore_env.sh new file mode 100644 index 0000000000000000000000000000000000000000..e0ccc28ae7849ae1a44c9fe78a9a4157d20e07f0 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/common/mindspore_env.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +export GLOG_v=3 +export HCCL_CONNECT_TIMEOUT=600 + +if [ -f /usr/local/Ascend/nnae/set_env.sh ];then + source /usr/local/Ascend/nnae/set_env.sh +elif [ -f /usr/local/Ascend/ascend-toolkit/set_env.sh ]; then + source /usr/local/Ascend/ascend-toolkit/set_env.sh +elif [ -f ~/Ascend/nnae/set_env.sh ]; then + source ~/Ascend/nnae/set_env.sh +elif [ -f ~/Ascend/ascend-toolkit/set_env.sh ]; then + source ~/Ascend/ascend-toolkit/set_env.sh +else + echo "warning find no env so not set" +fi diff --git a/huawei/ais-bench_workload/src/train/huawei/common/tensorflow_env.sh b/huawei/ais-bench_workload/src/train/huawei/common/tensorflow_env.sh new file mode 100644 index 0000000000000000000000000000000000000000..62f348c2d9434d51e01ba823510388cf4c1c3e4a --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/common/tensorflow_env.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# Please change to the actual installation path +export install_path=/usr/local/Ascend + +# driver +export LD_LIBRARY_PATH=${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:$LD_LIBRARY_PATH + +if [ -d /usr/local/Ascend/nnae/latest ];then + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/nnae/latest/compiler/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib + export 
PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/fwkplugin/latest/fwkplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/compiler/python/site-packages/:/usr/local/Ascend/fwkplugin/latest/fwkplugin/python/site-packages + export PATH=$PATH:/usr/local/Ascend/nnae/latest/compiler/ccec_compiler/bin:/usr/local/mpirun4.0/bin + export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp + export TBE_IMPL_PATH=/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core +else + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib + export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/fwkplugin/latest/fwkplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/compiler/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkplugin/python/site-packages:$projectDir + export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/mpirun4.0/bin + export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/ + export TBE_IMPL_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core +fi + +# Ascend-dmi +export LD_LIBRARY_PATH=/usr/local/dcmi:${install_path}/toolbox/latest/Ascend-DMI/lib64:${LD_LIBRARY_PATH} +export PATH=${install_path}/toolbox/latest/Ascend-DMI/bin:${PATH} + +export JOB_ID=123456789 + +export HCCL_CONNECT_TIMEOUT=600 +export HCCL_WHITELIST_DISABLE=1 + +# log +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +/usr/local/Ascend/driver/tools/msnpureport -d 0 -g error +/usr/local/Ascend/driver/tools/msnpureport -d 4 -g error +export SLOG_PRINT_TO_STDOUT=0 + +#system env +ulimit -c unlimited diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/build.sh 
b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..4505785b20e739a6e4555ddd2a178d000ae605f7 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/build.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + CONFIG_FILE=${CURDIR}//output/code/pretrain_config_Ascend_Boost.yaml + # update dataset_format to tfrecord, since 1.9 version + sed -i "s|dataset_format:.*|dataset_format: 'tfrecord'|g" "$CONFIG_FILE" + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + + mkdir -p ${CURDIR}/output/config + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/config.sh -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/modelarts_config.py -r ${CURDIR}//output/config/ + [ "$1" == "r1.3" ] && { cp ${CURDIR}/config/modelarts_config.py.r1.3 -r ${CURDIR}//output/config/modelarts_config.py; } + [ -d ${CURDIR}/doc ] && cp ${CURDIR}/doc -r ${CURDIR}/output/ + + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..5a835db5c64160112cdc8cf8607fe6fa3b796752 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/config/config.sh @@ -0,0 +1,18 @@ +export PYTHON_COMMAND=python3.7 +export TRAIN_DATA_PATH=/home/datasets/Bert-Dataset/ +export EVAL_DATA_PATH=/home/datasets/Bert-TestData/ + +export PRETRAIN_MODEL_PATH=/home/models/ms_bert_large.ckpt + +export EPOCH_SIZE=5 +export TRAIN_STEPS=12000 + +# 8p +export RANK_SIZE=8 +export DEVICE_NUM=8 + +# options needed only if rank_size > 1 +export RANK_TABLE_FILE=/home/lcm/tool/rank_table_8p.json + +# needed only in cluster mode +# export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/config/modelarts_config.py b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/config/modelarts_config.py new file mode 100644 index 0000000000000000000000000000000000000000..dfc8b229d1470758d1caa0467e4e2557c6b285fc --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/config/modelarts_config.py @@ -0,0 +1,159 @@ +from easydict import EasyDict as ed + +# 该部分为认证信息,请向相关运维同事咨询并填写 +access_config = ed({ + # 登录需要的ak sk信息 + 'access_key': '', + 'secret_access_key': '', + # 连接OBS的服务地址。可包含协议类型、域名、端口号。(出于安全性考虑,建议使用https协议) + # 如果是计算中心,需要联系运维同事获取 + 'server': '', + # project_id/region_name: + # 项目ID/区域ID,获取方式参考链接 + # https://support.huaweicloud.com/api-iam/iam_17_0002.html + # 如果是计算中心,请咨询相关维护同事 + 'region_name': '', + 'project_id': '', + + # 如下配置针对计算中心等专有云 通用云不需要设置 设置为空 请咨询相关维护同事 + # 设置该信息后 需要设置相关的域名解析地址 + 'iam_endpoint': '', + 'obs_endpoint': '', + 'modelarts_endpoint': '', +}) + +session_config = ed({ + # 运行模型的传入超参 + 'hyperparameters': [ + # 模型配置文件,默认boost模式,不需要修改 + {'label': 
'config_path', 'value': '../../pretrain_config_Ascend_Boost.yaml'}, + # 是否使能modelarts 必须设置为True,不需要修改 + {'label': 'enable_modelarts', 'value': 'True'}, + # 是否开启分布式,如果1卡以上的话都是True 一般不需要修改 + {'label': 'distribute', 'value': 'true'}, + # epoch次数 必须关注 当前默认设置为5 训练的epoch数 + # 优先级低于train_steps,如果存在train_steps以此为准,否则以epoch_size为准 + {'label': 'epoch_size', 'value': '5'}, + # 训练step数 必须填写并审视 该值优先级高于train_steps数 + {'label': 'train_steps', 'value': '12000'}, + # 是否保存ckpt文件 默认为True 保存ckpt + {'label': 'enable_save_ckpt', 'value': 'true'}, + # 不需要修改 + {'label': 'enable_lossscale', 'value': 'true'}, + # 不需要修改 + {'label': 'do_shuffle', 'value': 'true'}, + # 不需要修改 + {'label': 'enable_data_sink', 'value': 'true'}, + # 不需要修改 + {'label': 'data_sink_steps', 'value': '100'}, + # 不需要修改 + {'label': 'accumulation_steps', 'value': '1'}, + # 保存ckpt的step数 注意 该值必须要跟step数保存一致 这样提高性能 + {'label': 'save_checkpoint_steps', 'value': '12000'}, + # 保存ckpt的个数 默认为1 不需要修改 + {'label': 'save_checkpoint_num', 'value': '1'}, + ], + # 输入数据集obs目录,请按样例格式填写 + 'inputs': '/zgwtest/lcm_test/dataset/enwiki_small/', + # obs代码路径 程序会自动拷贝到该路径 + 'code_dir': '/zgwtest/lcm_test/bert/', + # 启动文件 必须要在code_dir路径下,请按样例格式填写 + 'boot_file': '/zgwtest/lcm_test/bert/run_pretrain.py', + + # 如下为运行相关参数 + # job名称 如果云环境Modelarts服务训练作业job队列中没有,则会新建一个job;若和已有job同名,则会在该job中,新建测试实例. 
+ 'job_name': "aisbench-debug", + + # 使用容器类型与镜像版本 + 'framework_type': 'Ascend-Powered-Engine', + 'framework_version': 'MindSpore-1.3-cann_5.0.2-python3.7-euleros2.8-aarch64', + + # 资源参数类型主要包括如下2个值 train_instance_type和pool_id + # 不设置pool_id 默认是公共池 设置了就是专属资源池 + # 只设置pool_id 不设置train_instance_type 默认为专属资源池的默认类型 + # train_instance_type 在程序打印中有提示的 一般为如下四个值 分别对应 1卡 2卡 4卡 8卡 + # ['modelarts.kat1.xlarge', 'modelarts.kat1.2xlarge', 'modelarts.kat1.4xlarge', 'modelarts.kat1.8xlarge'] + # https://support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0191.html 该链接指示获取方法 + + # 专属资源池id 不是则为None + 'pool_id': None, + # 训练类型 如下为8卡 如果是专属资源池id设置,那么该类型需要设置为None + 'train_instance_type': 'modelarts.kat1.8xlarge', + # 训练结点数 + 'train_instance_count': 1, + + # 云存储路径 默认为空 + # 'nas_type' : None, + # 'nas_share_addr' : None, + # 'nas_mount_path' : None, + + # 输出信息基准路径 整体路径为 train_url = out_base_url/version_name + "out_base_url": "/zgwtest/lcm_test/result/", + # job 描述前缀 + "job_description_prefix": 'lcm-debug desc', +}) + +session_config_v2 = ed({ + # 运行模型的传入超参 + 'parameters': [ + # 模型配置文件,默认boost模式,不需要修改 + {'name': 'config_path', 'value': '../../pretrain_config_Ascend_Boost.yaml'}, + # 是否使能modelarts 必须设置为True,不需要修改 + {'name': 'enable_modelarts', 'value': 'True'}, + # 是否开启分布式,如果1卡以上的话都是True 一般不需要修改 + {'name': 'distribute', 'value': 'true'}, + # epoch次数 必须关注 当前默认设置为5 训练的epoch数 + # 优先级低于train_steps,如果存在train_steps以此为准,否则以epoch_size为准 + {'name': 'epoch_size', 'value': '5'}, + # 训练step数 必须填写并审视 该值优先级高于train_steps数 + {'name': 'train_steps', 'value': '12000'}, + # 是否保存ckpt文件 默认为True 保存ckpt + {'name': 'enable_save_ckpt', 'value': 'true'}, + # 不需要修改 + {'name': 'enable_lossscale', 'value': 'true'}, + # 不需要修改 + {'name': 'do_shuffle', 'value': 'true'}, + # 不需要修改 + {'name': 'enable_data_sink', 'value': 'true'}, + # 不需要修改 + {'name': 'data_sink_steps', 'value': '100'}, + # 不需要修改 + {'name': 'accumulation_steps', 'value': '1'}, + # 保存ckpt的step数 注意 该值必须要跟step数保存一致 这样提高性能 + {'name': 
'save_checkpoint_steps', 'value': '12000'}, + # 保存ckpt的个数 默认为1 不需要修改 + {'name': 'save_checkpoint_num', 'value': '1'}, + ], + # 输入数据集obs目录,请按样例格式填写 + 'inputs': '/zgwtest/lcm_test/dataset/enwiki_small/', + # obs代码路径 程序会自动拷贝到该路径. 和boot_files一起用于复合参数 training_files + 'code_dir': '/zgwtest/lcm_test/bert/', + # 启动文件 必须要在code_dir路径下,请按样例格式填写 + 'boot_file': '/zgwtest/lcm_test/bert/run_pretrain.py', + + # 如下为运行相关参数 + # job名称 如果云环境Modelarts服务训练作业job队列中没有,则会新建一个job;若和已有job同名,则会在该job中,新建测试实例. + 'job_name': "aisbench-debug", + + # 使用容器类型与镜像版本 + 'framework_type': 'Ascend-Powered-Engine', + 'framework_version': 'mindspore_1.3.0-cann_5.0.2-py_3.7-euler_2.8.3-aarch64', + + # pool_id不设置或者设置为None, 默认是公共资源池。 设置了就表示是专属资源池。在ModelArts管理控制台,单击左侧“专属资源池”,在专属资源池列表中可以查看专属资源池ID,类似poolc90f063b + 'pool_id': None, + # 训练类型,默认8卡。 train_instance_type 在程序打印中有提示的,请注意紧随“get valid train_instance_types:”之后的打印输出. 由modelarts.estimatorV2 类Estimator的接口get_train_instance_types()查询而来。 + # 请参见https://support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0431.html 该链接指示获取方法。注意不同云环境查询的结果不同 + 'train_instance_type': 'modelarts.kat1.8xlarge', + # 训练节点数 + 'train_instance_count': 1, + + # 云存储路径 默认为空 + # 'nas_type' : None, + # 'nas_share_addr' : None, + # 'nas_mount_path' : None, + + # 输出信息基准路径 整体路径为 train_url = out_base_url/version_name + "out_base_url": "/zgwtest/lcm_test/result/", + # job 描述前缀 + "job_description_prefix": 'lcm-debug desc', +}) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/config/modelarts_config.py.r1.3 b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/config/modelarts_config.py.r1.3 new file mode 100644 index 0000000000000000000000000000000000000000..6e7725c3ddac138e091511303f2de217f89fc1ab --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/config/modelarts_config.py.r1.3 @@ -0,0 +1,159 @@ +from easydict import EasyDict as ed + +# 该部分为认证信息,请向相关运维同事咨询并填写 +access_config = ed({ + # 登录需要的ak sk信息 + 'access_key': '', + 
'secret_access_key': '', + # 连接OBS的服务地址。可包含协议类型、域名、端口号。(出于安全性考虑,建议使用https协议) + # 如果是计算中心,需要联系运维同事获取 + 'server': '', + # project_id/region_name: + # 项目ID/区域ID,获取方式参考链接 + # https://support.huaweicloud.com/api-iam/iam_17_0002.html + # 如果是计算中心,请咨询相关维护同事 + 'region_name': '', + 'project_id': '', + + # 如下配置针对计算中心等专有云 通用云不需要设置 设置为空 请咨询相关维护同事 + # 设置该信息后 需要设置相关的域名解析地址 + 'iam_endpoint': '', + 'obs_endpoint': '', + 'modelarts_endpoint' : '', +}) + +session_config = ed({ + # 运行模型的传入超参 + 'hyperparameters': [ + # bert模型类型 默认large_acc模式 不需要修改 + {'label': 'bert_network', 'value': 'large_acc'}, + # 是否使能modelarts 必须设置为True,不需要修改 + {'label': 'enable_modelarts', 'value': 'True'}, + # 是否开启分布式,如果1卡以上的话都是True 一般不需要修改 + {'label': 'distribute', 'value': 'true'}, + # epoch次数 必须关注 当前默认设置为5 训练的epoch数 + # 优先级低于train_steps,如果存在train_steps以此为准,否则以epoch_size为准 + {'label': 'epoch_size', 'value': '5'}, + # 训练step数 必须填写并审视 该值优先级高于train_steps数 + {'label': 'train_steps', 'value': '12000'}, + # 是否保存ckpt文件 默认为True 保存ckpt + {'label': 'enable_save_ckpt', 'value': 'true'}, + # 不需要修改 + {'label': 'enable_lossscale', 'value': 'true'}, + # 不需要修改 + {'label': 'do_shuffle', 'value': 'true'}, + # 不需要修改 + {'label': 'enable_data_sink', 'value': 'true'}, + # 不需要修改 + {'label': 'data_sink_steps', 'value': '100'}, + # 不需要修改 + {'label': 'accumulation_steps', 'value': '1'}, + # 保存ckpt的step数 注意 该值必须要跟step数保存一致 这样提高性能 + {'label': 'save_checkpoint_steps', 'value': '12000'}, + # 保存ckpt的个数 默认为1 不需要修改 + {'label': 'save_checkpoint_num', 'value': '1'}, + ], + # 输入数据集obs目录,请按样例格式填写 + 'inputs': '/zgwtest/lcm_test/dataset/enwiki_small/', + # obs代码路径 程序会自动拷贝到该路径 + 'code_dir': '/zgwtest/lcm_test/bert/', + # 启动文件 必须要在code_dir路径下,请按样例格式填写 + 'boot_file': '/zgwtest/lcm_test/bert/run_pretrain.py', + + # 如下为运行相关参数 + # job名称 如果云环境Modelarts服务训练作业job队列中没有,则会新建一个job;若和已有job同名,则会在该job中,新建测试实例. 
+ 'job_name': "aisbench-debug", + + # 使用容器类型与镜像版本 + 'framework_type': 'Ascend-Powered-Engine', + 'framework_version': 'MindSpore-1.3-cann_5.0.2-python3.7-euleros2.8-aarch64', + + # 资源参数类型主要包括如下2个值 train_instance_type和pool_id + # 不设置pool_id 默认是公共池 设置了就是专属资源池 + # 只设置pool_id 不设置train_instance_type 默认为专属资源池的默认类型 + # train_instance_type 在程序打印中有提示的 一般为如下四个值 分别对应 1卡 2卡 4卡 8卡 + # ['modelarts.kat1.xlarge', 'modelarts.kat1.2xlarge', 'modelarts.kat1.4xlarge', 'modelarts.kat1.8xlarge'] + # https://support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0191.html 该链接指示获取方法 + + # 专属资源池id 不是则为None + 'pool_id' : None, + # 训练类型 如下为8卡 如果是专属资源池id设置,那么该类型需要设置为None + 'train_instance_type': 'modelarts.kat1.8xlarge', + # 训练结点数 + 'train_instance_count': 1, + + # 云存储路径 默认为空 + # 'nas_type' : None, + # 'nas_share_addr' : None, + # 'nas_mount_path' : None, + + # 输出信息基准路径 整体路径为 train_url = out_base_url/version_name + "out_base_url": "/zgwtest/lcm_test/result/", + # job 描述前缀 + "job_description_prefix": 'lcm-debug desc', +}) + +session_config_v2 = ed({ + # 运行模型的传入超参 + 'parameters': [ + # bert模型类型 默认large_acc模式 不需要修改 + {'name': 'bert_network', 'value': 'large_acc'}, + # 是否使能modelarts 必须设置为True,不需要修改 + {'name': 'enable_modelarts', 'value': 'True'}, + # 是否开启分布式,如果1卡以上的话都是True 一般不需要修改 + {'name': 'distribute', 'value': 'true'}, + # epoch次数 必须关注 当前默认设置为5 训练的epoch数 + # 优先级低于train_steps,如果存在train_steps以此为准,否则以epoch_size为准 + {'name': 'epoch_size', 'value': '5'}, + # 训练step数 必须填写并审视 该值优先级高于train_steps数 + {'name': 'train_steps', 'value': '12000'}, + # 是否保存ckpt文件 默认为True 保存ckpt + {'name': 'enable_save_ckpt', 'value': 'true'}, + # 不需要修改 + {'name': 'enable_lossscale', 'value': 'true'}, + # 不需要修改 + {'name': 'do_shuffle', 'value': 'true'}, + # 不需要修改 + {'name': 'enable_data_sink', 'value': 'true'}, + # 不需要修改 + {'name': 'data_sink_steps', 'value': '100'}, + # 不需要修改 + {'name': 'accumulation_steps', 'value': '1'}, + # 保存ckpt的step数 注意 该值必须要跟step数保存一致 这样提高性能 + {'name': 'save_checkpoint_steps', 'value': '12000'}, 
+ # 保存ckpt的个数 默认为1 不需要修改 + {'name': 'save_checkpoint_num', 'value': '1'}, + ], + # 输入数据集obs目录,请按样例格式填写 + 'inputs': '/zgwtest/lcm_test/dataset/enwiki_small/', + # obs代码路径 程序会自动拷贝到该路径. 和boot_files一起用于复合参数 training_files + 'code_dir': '/zgwtest/lcm_test/bert/', + # 启动文件 必须要在code_dir路径下,请按样例格式填写 + 'boot_file': '/zgwtest/lcm_test/bert/run_pretrain.py', + + # 如下为运行相关参数 + # job名称 如果云环境Modelarts服务训练作业job队列中没有,则会新建一个job;若和已有job同名,则会在该job中,新建测试实例. + 'job_name': "aisbench-debug", + + # 使用容器类型与镜像版本 + 'framework_type': 'Ascend-Powered-Engine', + 'framework_version': 'mindspore_1.3.0-cann_5.0.2-py_3.7-euler_2.8.3-aarch64', + + # pool_id不设置或者设置为None, 默认是公共资源池。 设置了就表示是专属资源池。在ModelArts管理控制台,单击左侧“专属资源池”,在专属资源池列表中可以查看专属资源池ID,类似poolc90f063b + 'pool_id' : None, + # 训练类型,默认8卡。train_instance_type 在程序打印中有提示的,请注意紧随“get valid train_instance_types:”之后的打印输出. 由modelarts.estimatorV2 类Estimator的接口get_train_instance_types()查询而来。 + # 请参见https://support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0431.html 该链接指示获取方法。注意不同云环境查询的结果不同 + 'train_instance_type': 'modelarts.kat1.8xlarge', + # 训练节点数 + 'train_instance_count': 1, + + # 云存储路径 默认为空 + # 'nas_type' : None, + # 'nas_share_addr' : None, + # 'nas_mount_path' : None, + + # 输出信息基准路径 整体路径为 train_url = out_base_url/version_name + "out_base_url": "/zgwtest/lcm_test/result/", + # job 描述前缀 + "job_description_prefix": 'lcm-debug desc', +}) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.10.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.10.patch new file mode 100644 index 0000000000000000000000000000000000000000..f64e7a0e1fcdc80aa7b7cc7a68d3c69336b92264 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.10.patch @@ -0,0 +1,182 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2023-04-10 14:42:15.600000000 +0800 ++++ code/pretrain_eval.py 2023-04-10 14:42:15.610000000 +0800 +@@ -32,7 
+32,11 @@ + Predict function + ''' + devid = int(os.getenv('DEVICE_ID')) +- context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ # for modelarts mode eval after traing, should set alone and no need set device_id ++ #context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ from mindspore.context import ParallelMode ++ context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE) ++ + dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir, dataset_format=cfg.dataset_format) + net_for_pretraining = BertPretrainEval(bert_net_cfg) + net_for_pretraining.set_train(False) +@@ -53,6 +57,19 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ v = v.item() ++ import moxing as mox ++ from src.model_utils.device_adapter import get_device_num ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(cfg.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(v)) ++ accuracy_file1 = os.path.join(cfg.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(v)) ++ ranksize_file = os.path.join(cfg.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) ++ + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2023-04-10 14:42:15.590000000 +0800 ++++ code/run_pretrain.py 2023-04-10 14:42:15.610000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -42,7 +43,7 @@ + from src.utils import LossCallBack, BertLearningRate, EvalCallBack, BertMetric + from 
src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper +-from src.model_utils.device_adapter import get_device_id, get_device_num ++from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -192,8 +193,18 @@ + net_with_grads = BertNetworkMatchBucket(net_with_grads, bert_net_cfg.seq_length, cfg.bucket_list) + return net_with_grads + ++def modelarts_post_process(): ++ def is_ckpt(name): ++ if name.endswith('ckpt'): ++ return True ++ return False ++ ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) ++ if os.path.exists(ckpt_save_dir): ++ ckpts = list(filter(is_ckpt, os.listdir(ckpt_save_dir))) ++ ckpts.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) ++ cfg.eval_ckpt = os.path.join(ckpt_save_dir, ckpts[-1]) + +-@moxing_wrapper(pre_process=modelarts_pre_process) ++@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process) + def run_pretrain(): + """pre-train bert_clue""" + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) +@@ -268,10 +279,85 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ end_time = time.time() ++ data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ throughput_rate = data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} train_step:{} ds getdataset:{} new_repeat_count:{} data_sum:{} single_throughput_rate:{}".format( ++ start_time, end_time, cfg.train_steps, ds.get_dataset_size(), 
new_repeat_count, data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = cfg.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(cfg.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + 
if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set OK") ++ else: ++ print("singleserver_mode not set") ++ + set_seed(0) + run_pretrain() ++ ++ if get_rank() == 0: ++ from pretrain_eval import MLM_eval ++ MLM_eval() +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-10 14:42:15.590000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-10 14:42:15.610000000 +0800 +@@ -100,6 +100,16 @@ + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ # modelarts sdk fit ++ config.schema_file = None ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "ms_bert_large.ckpt")): ++ config.load_checkpoint_path = os.path.join(base_path, "ms_bert_large.ckpt") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_data_dir = os.path.join(base_path, "val") ++ + if pre_process: + pre_process() + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.3.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.3.patch new file mode 100644 index 0000000000000000000000000000000000000000..adad995e28f963408b11864e793affe492b8f6f5 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.3.patch @@ -0,0 +1,199 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2022-07-11 14:30:06.510000000 +0800 ++++ code/pretrain_eval.py 2022-07-11 14:30:06.530000000 +0800 +@@ -151,7 +151,19 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ") +- print(v) ++ v = v.item() ++ import moxing as mox ++ from src.model_utils.device_adapter import 
get_device_num ++ server_id = os.getenv("BATCH_TASK_INDEX", 0) ++ accuracy_file = os.path.join(cfg.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(v)) ++ accuracy_file1 = os.path.join(cfg.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(v)) ++ ranksize_file = os.path.join(cfg.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) ++ + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2022-07-11 14:30:06.510000000 +0800 ++++ code/run_pretrain.py 2022-07-11 14:30:06.520000000 +0800 +@@ -39,9 +39,37 @@ + from src.utils import LossCallBack, BertLearningRate + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper +-from src.model_utils.device_adapter import get_device_id, get_device_num ++from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id + _current_dir = os.path.dirname(os.path.realpath(__file__)) + ++import time ++from mindspore.train.callback._callback import Callback ++ ++skip_pre_step_size = 1 ++skip_pre_step_time_sum = 0 ++real_start_time = 0.0 ++ ++class ThroughputRate(Callback): ++ def __init__(self): ++ super(ThroughputRate, self).__init__() ++ self.count = 0 ++ ++ def step_begin(self, run_context): ++ self.step_time = time.time() ++ if self.count == skip_pre_step_size: ++ global real_start_time ++ real_start_time = self.step_time ++ ++ def step_end(self, run_context): ++ step_seconds = (time.time() - self.step_time) ++ ++ global skip_pre_epoch_size ++ global skip_pre_step_time_sum ++ ++ if self.count < skip_pre_step_size: ++ skip_pre_step_time_sum += step_seconds ++ print("skip:{} of {} step_seconds:{} time_sum:{}".format(self.count, skip_pre_step_size, step_seconds, skip_pre_step_time_sum)) ++ 
self.count = self.count + 1 + + def _set_bert_all_reduce_split(): + """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" +@@ -157,8 +185,18 @@ + cfg.data_dir = cfg.data_path + cfg.save_checkpoint_path = os.path.join(cfg.output_path, cfg.save_checkpoint_path) + ++def modelarts_post_process(): ++ def is_ckpt(name): ++ if name.endswith('ckpt'): ++ return True ++ return False ++ ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) ++ if os.path.exists(ckpt_save_dir): ++ ckpts = list(filter(is_ckpt, os.listdir(ckpt_save_dir))) ++ ckpts.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) ++ cfg.finetune_ckpt = os.path.join(ckpt_save_dir, ckpts[-1]) + +-@moxing_wrapper(pre_process=modelarts_pre_process) ++@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process) + def run_pretrain(): + """pre-train bert_clue""" + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) +@@ -209,6 +247,7 @@ + + optimizer = _get_optimizer(cfg, net_with_loss) + callback = [TimeMonitor(cfg.data_sink_steps), LossCallBack(ds.get_dataset_size())] ++ callback.append(ThroughputRate()) + if cfg.enable_save_ckpt == "true" and cfg.device_id % min(8, device_num) == 0: + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.save_checkpoint_num) +@@ -250,10 +289,83 @@ + + model = Model(net_with_grads) + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) +- ++ end_time = time.time() ++ data_sum = ((new_repeat_count - skip_pre_step_size) * cfg.data_sink_steps * cfg.batch_size) ++ throughput_rate = data_sum / (end_time - real_start_time) ++ print("train done starttime:{} real:{} endtime:{} train_step:{} 
skiptime:{} ds getdataset:{} new_repeat_count:{} data_sum:{} throughput_rate:{}".format( ++ start_time, real_start_time, end_time, cfg.train_steps, skip_pre_step_time_sum, ds.get_dataset_size(), new_repeat_count, data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = cfg.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(cfg.train_url, os.getenv("BATCH_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = 
FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set OK") ++ else: ++ print("singleserver_mode not set") + set_seed(0) + run_pretrain() ++ ++ if get_rank() == 0: ++ from pretrain_eval import MLM_eval ++ MLM_eval() +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2022-07-11 14:30:06.510000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2022-07-11 14:30:06.530000000 +0800 +@@ -100,6 +100,16 @@ + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ # modelarts sdk fit ++ config.schema_file = None ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "ms_bert_large.ckpt")): ++ config.load_checkpoint_path = os.path.join(base_path, "ms_bert_large.ckpt") ++ if os.path.exists(os.path.join(base_path, "val/eval_10k.tfrecord")): ++ config.data_file = os.path.join(base_path, "val/eval_10k.tfrecord") ++ + if pre_process: + pre_process() + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.5.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.5.patch new file mode 100644 index 0000000000000000000000000000000000000000..2e184959d69e7dd0694e3127573c4d7a17504e5b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.5.patch @@ -0,0 +1,192 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2022-07-11 17:53:51.470000000 +0800 ++++ code/pretrain_eval.py 2022-07-11 17:53:51.480000000 +0800 +@@ -33,6 +33,10 
@@ + ''' + devid = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ # for modelarts mode eval after traing, should set alone and no need set device_id ++ #context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ from mindspore.context import ParallelMode ++ context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE) + dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir) + net_for_pretraining = BertPretrainEval(bert_net_cfg) + net_for_pretraining.set_train(False) +@@ -53,6 +57,18 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ v = v.item() ++ import moxing as mox ++ from src.model_utils.device_adapter import get_device_num ++ server_id = os.getenv("BATCH_TASK_INDEX", 0) ++ accuracy_file = os.path.join(cfg.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(v)) ++ accuracy_file1 = os.path.join(cfg.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(v)) ++ ranksize_file = os.path.join(cfg.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2022-07-11 17:53:51.460000000 +0800 ++++ code/run_pretrain.py 2022-07-11 17:53:51.480000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -42,7 +43,7 @@ + from src.utils import LossCallBack, BertLearningRate, EvalCallBack, BertMetric + from src.model_utils.config import config as cfg, bert_net_cfg + from 
src.model_utils.moxing_adapter import moxing_wrapper +-from src.model_utils.device_adapter import get_device_id, get_device_num ++from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -158,8 +159,19 @@ + cfg.data_dir = cfg.data_path + cfg.save_checkpoint_path = os.path.join(cfg.output_path, cfg.save_checkpoint_path) + ++def modelarts_post_process(): ++ def is_ckpt(name): ++ if name.endswith('ckpt'): ++ return True ++ return False ++ ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) ++ if os.path.exists(ckpt_save_dir): ++ ckpts = list(filter(is_ckpt, os.listdir(ckpt_save_dir))) ++ ckpts.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) ++ cfg.eval_ckpt = os.path.join(ckpt_save_dir, ckpts[-1]) ++ + +-@moxing_wrapper(pre_process=modelarts_pre_process) ++@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process) + def run_pretrain(): + """pre-train bert_clue""" + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) +@@ -262,10 +274,85 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ end_time = time.time() ++ data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ throughput_rate = data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} train_step:{} ds getdataset:{} new_repeat_count:{} data_sum:{} single_throughput_rate:{}".format( ++ start_time, end_time, cfg.train_steps, ds.get_dataset_size(), new_repeat_count, data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = 
cfg.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(cfg.train_url, os.getenv("BATCH_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ 
set_singleserver_mode() ++ print("singleserver_mode set OK") ++ else: ++ print("singleserver_mode not set") ++ + set_seed(0) + run_pretrain() ++ ++ if get_rank() == 0: ++ from pretrain_eval import MLM_eval ++ MLM_eval() +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py +--- origin/src/model_utils/config.py 2022-07-11 17:53:51.460000000 +0800 ++++ code/src/model_utils/config.py 2022-07-11 17:53:51.480000000 +0800 +@@ -196,6 +196,8 @@ + parser.add_argument("--config_path", type=get_abs_path, default="../../pretrain_config.yaml", + help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '../..', path_args.config_path) + default, helper, choices = parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2022-07-11 17:53:51.460000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2022-07-11 17:53:51.480000000 +0800 +@@ -100,6 +100,16 @@ + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ # modelarts sdk fit ++ config.schema_file = None ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "ms_bert_large.ckpt")): ++ config.load_checkpoint_path = os.path.join(base_path, "ms_bert_large.ckpt") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_data_dir = os.path.join(base_path, "val") ++ + if pre_process: + pre_process() + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.7.patch 
b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.7.patch new file mode 100644 index 0000000000000000000000000000000000000000..0178e6446fc5e9253eeaff8509d9be5e9088bc7a --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.7.patch @@ -0,0 +1,182 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2022-05-22 21:56:20.710000000 +0800 ++++ code/pretrain_eval.py 2022-05-22 21:56:20.740000000 +0800 +@@ -32,7 +32,10 @@ + Predict function + ''' + devid = int(os.getenv('DEVICE_ID')) +- context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ # for modelarts mode eval after traing, should set alone and no need set device_id ++ #context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ from mindspore.context import ParallelMode ++ context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE) + dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir) + net_for_pretraining = BertPretrainEval(bert_net_cfg) + net_for_pretraining.set_train(False) +@@ -53,6 +56,18 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ v = v.item() ++ import moxing as mox ++ from src.model_utils.device_adapter import get_device_num ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(cfg.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(v)) ++ accuracy_file1 = os.path.join(cfg.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(v)) ++ ranksize_file = os.path.join(cfg.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py 
code/run_pretrain.py +--- origin/run_pretrain.py 2022-05-22 21:56:20.690000000 +0800 ++++ code/run_pretrain.py 2022-05-22 21:56:20.720000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -42,7 +43,7 @@ + from src.utils import LossCallBack, BertLearningRate, EvalCallBack, BertMetric + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper +-from src.model_utils.device_adapter import get_device_id, get_device_num ++from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -158,8 +159,19 @@ + cfg.data_dir = cfg.data_path + cfg.save_checkpoint_path = os.path.join(cfg.output_path, cfg.save_checkpoint_path) + ++def modelarts_post_process(): ++ def is_ckpt(name): ++ if name.endswith('ckpt'): ++ return True ++ return False ++ ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) ++ if os.path.exists(ckpt_save_dir): ++ ckpts = list(filter(is_ckpt, os.listdir(ckpt_save_dir))) ++ ckpts.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) ++ cfg.eval_ckpt = os.path.join(ckpt_save_dir, ckpts[-1]) ++ + +-@moxing_wrapper(pre_process=modelarts_pre_process) ++@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process) + def run_pretrain(): + """pre-train bert_clue""" + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) +@@ -262,10 +274,86 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ start_time = time.time() + model.train(new_repeat_count, ds, 
callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ end_time = time.time() ++ data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ throughput_rate = data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} train_step:{} ds getdataset:{} new_repeat_count:{} data_sum:{} single_throughput_rate:{}".format( ++ start_time, end_time, cfg.train_steps, ds.get_dataset_size(), new_repeat_count, data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = cfg.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(cfg.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ 
os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set OK") ++ else: ++ print("singleserver_mode not set") ++ + set_seed(0) + run_pretrain() ++ ++ if get_rank() == 0: ++ from pretrain_eval import MLM_eval ++ MLM_eval() ++ +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2022-05-22 21:56:20.700000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2022-05-22 21:56:20.730000000 +0800 +@@ -100,6 +100,16 @@ + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ # modelarts sdk fit ++ config.schema_file = None ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "ms_bert_large.ckpt")): ++ config.load_checkpoint_path = os.path.join(base_path, "ms_bert_large.ckpt") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_data_dir = os.path.join(base_path, "val") ++ + if pre_process: + pre_process() + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.8.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.8.patch new file mode 100644 index 0000000000000000000000000000000000000000..4932a62ffc79d3d60bf1c03a1e35bdf954c71dd2 --- /dev/null +++ 
b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.8.patch @@ -0,0 +1,184 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2023-04-10 11:15:24.640000000 +0800 ++++ code/pretrain_eval.py 2023-04-10 11:15:24.660000000 +0800 +@@ -32,7 +32,11 @@ + Predict function + ''' + devid = int(os.getenv('DEVICE_ID')) +- context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ # for modelarts mode eval after traing, should set alone and no need set device_id ++ #context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ from mindspore.context import ParallelMode ++ context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE) ++ + dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir) + net_for_pretraining = BertPretrainEval(bert_net_cfg) + net_for_pretraining.set_train(False) +@@ -53,6 +57,19 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ v = v.item() ++ import moxing as mox ++ from src.model_utils.device_adapter import get_device_num ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(cfg.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(v)) ++ accuracy_file1 = os.path.join(cfg.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(v)) ++ ranksize_file = os.path.join(cfg.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) ++ + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2023-04-10 11:15:24.640000000 +0800 ++++ code/run_pretrain.py 2023-04-10 11:15:24.650000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + 
import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -42,7 +43,7 @@ + from src.utils import LossCallBack, BertLearningRate, EvalCallBack, BertMetric + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper +-from src.model_utils.device_adapter import get_device_id, get_device_num ++from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -158,8 +159,18 @@ + cfg.data_dir = cfg.data_path + cfg.save_checkpoint_path = os.path.join(cfg.output_path, cfg.save_checkpoint_path) + ++def modelarts_post_process(): ++ def is_ckpt(name): ++ if name.endswith('ckpt'): ++ return True ++ return False ++ ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) ++ if os.path.exists(ckpt_save_dir): ++ ckpts = list(filter(is_ckpt, os.listdir(ckpt_save_dir))) ++ ckpts.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) ++ cfg.eval_ckpt = os.path.join(ckpt_save_dir, ckpts[-1]) + +-@moxing_wrapper(pre_process=modelarts_pre_process) ++@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process) + def run_pretrain(): + """pre-train bert_clue""" + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) +@@ -262,10 +273,88 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ end_time = time.time() ++ data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ throughput_rate = data_sum / 
(int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} train_step:{} ds getdataset:{} new_repeat_count:{} data_sum:{} single_throughput_rate:{}".format( ++ start_time, end_time, cfg.train_steps, ds.get_dataset_size(), new_repeat_count, data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = cfg.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(cfg.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) + + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): 
++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id ++ ++ + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set OK") ++ else: ++ print("singleserver_mode not set") ++ + set_seed(0) + run_pretrain() ++ ++ if get_rank() == 0: ++ from pretrain_eval import MLM_eval ++ MLM_eval() ++ +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-10 11:15:24.640000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-10 11:15:24.660000000 +0800 +@@ -99,6 +99,15 @@ + config.device_id = get_device_id() + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) ++ # modelarts sdk fit ++ config.schema_file = None ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "ms_bert_large.ckpt")): ++ config.load_checkpoint_path = os.path.join(base_path, "ms_bert_large.ckpt") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_data_dir = os.path.join(base_path, "val") + + if pre_process: + pre_process() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.9.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.9.patch new file mode 100644 index 0000000000000000000000000000000000000000..76c8d6941e74ad50e0144f4c8de3e3cc5d46bb6a --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r1.9.patch @@ -0,0 +1,182 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2022-10-18 11:56:29.660000000 +0800 ++++ code/pretrain_eval.py 
2022-10-18 11:56:29.676000000 +0800 +@@ -32,7 +32,11 @@ + Predict function + ''' + devid = int(os.getenv('DEVICE_ID')) +- context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ # for modelarts mode eval after traing, should set alone and no need set device_id ++ #context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ from mindspore.context import ParallelMode ++ context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE) ++ + dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir, dataset_format=cfg.dataset_format) + net_for_pretraining = BertPretrainEval(bert_net_cfg) + net_for_pretraining.set_train(False) +@@ -53,6 +57,18 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ v = v.item() ++ import moxing as mox ++ from src.model_utils.device_adapter import get_device_num ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(cfg.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(v)) ++ accuracy_file1 = os.path.join(cfg.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(v)) ++ ranksize_file = os.path.join(cfg.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2022-10-18 11:56:29.660000000 +0800 ++++ code/run_pretrain.py 2022-10-18 11:56:29.676000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -42,7 +43,7 @@ + from src.utils import LossCallBack, BertLearningRate, 
EvalCallBack, BertMetric + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper +-from src.model_utils.device_adapter import get_device_id, get_device_num ++from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -192,8 +193,18 @@ + net_with_grads = BertNetworkMatchBucket(net_with_grads, bert_net_cfg.seq_length, cfg.bucket_list) + return net_with_grads + ++def modelarts_post_process(): ++ def is_ckpt(name): ++ if name.endswith('ckpt'): ++ return True ++ return False ++ ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) ++ if os.path.exists(ckpt_save_dir): ++ ckpts = list(filter(is_ckpt, os.listdir(ckpt_save_dir))) ++ ckpts.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) ++ cfg.eval_ckpt = os.path.join(ckpt_save_dir, ckpts[-1]) + +-@moxing_wrapper(pre_process=modelarts_pre_process) ++@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process) + def run_pretrain(): + """pre-train bert_clue""" + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) +@@ -268,10 +279,86 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ end_time = time.time() ++ data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ throughput_rate = data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} train_step:{} ds getdataset:{} new_repeat_count:{} data_sum:{} single_throughput_rate:{}".format( ++ start_time, end_time, cfg.train_steps, 
ds.get_dataset_size(), new_repeat_count, data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = cfg.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(cfg.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) 
++ return new_rank_id + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set OK") ++ else: ++ print("singleserver_mode not set") ++ + set_seed(0) + run_pretrain() ++ ++ if get_rank() == 0: ++ from pretrain_eval import MLM_eval ++ MLM_eval() +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2022-10-18 11:56:29.668000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2022-10-18 11:56:29.688000000 +0800 +@@ -99,6 +99,16 @@ + config.device_id = get_device_id() + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) ++ # modelarts sdk fit ++ config.schema_file = None ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "ms_bert_large.ckpt")): ++ config.load_checkpoint_path = os.path.join(base_path, "ms_bert_large.ckpt") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_data_dir = os.path.join(base_path, "val") ++ + + if pre_process: + pre_process() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r2.0.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r2.0.patch new file mode 100644 index 0000000000000000000000000000000000000000..932d168243efeb69768fc4c6af56e3529d7b445f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r2.0.patch @@ -0,0 +1,180 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2023-04-07 20:15:40.860000000 +0800 ++++ code/pretrain_eval.py 2023-04-07 20:15:40.880000000 +0800 +@@ -33,6 +33,10 @@ + ''' + devid = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ # for modelarts mode eval after 
traing, should set alone and no need set device_id ++ #context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ from mindspore.context import ParallelMode ++ context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE) + dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir, dataset_format=cfg.dataset_format) + net_for_pretraining = BertPretrainEval(bert_net_cfg) + net_for_pretraining.set_train(False) +@@ -53,6 +57,18 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ v = v.item() ++ import moxing as mox ++ from src.model_utils.device_adapter import get_device_num ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(cfg.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(v)) ++ accuracy_file1 = os.path.join(cfg.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(v)) ++ ranksize_file = os.path.join(cfg.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2023-04-07 20:15:40.850000000 +0800 ++++ code/run_pretrain.py 2023-04-07 20:15:40.870000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -43,6 +44,7 @@ + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num ++from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id + _current_dir 
= os.path.dirname(os.path.realpath(__file__)) + + +@@ -192,8 +194,18 @@ + net_with_grads = BertNetworkMatchBucket(net_with_grads, bert_net_cfg.seq_length, cfg.bucket_list) + return net_with_grads + ++def modelarts_post_process(): ++ def is_ckpt(name): ++ if name.endswith('ckpt'): ++ return True ++ return False ++ ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) ++ if os.path.exists(ckpt_save_dir): ++ ckpts = list(filter(is_ckpt, os.listdir(ckpt_save_dir))) ++ ckpts.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) ++ cfg.eval_ckpt = os.path.join(ckpt_save_dir, ckpts[-1]) + +-@moxing_wrapper(pre_process=modelarts_pre_process) ++@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process) + def run_pretrain(): + """pre-train bert_clue""" + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) +@@ -268,10 +280,87 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ end_time = time.time() ++ data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ throughput_rate = data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} train_step:{} ds getdataset:{} new_repeat_count:{} data_sum:{} single_throughput_rate:{}".format( ++ start_time, end_time, cfg.train_steps, ds.get_dataset_size(), new_repeat_count, data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = cfg.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(cfg.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, 
"throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set OK") ++ else: ++ print("singleserver_mode not set") ++ + set_seed(0) + run_pretrain() ++ ++ if get_rank() == 0: ++ from pretrain_eval import MLM_eval ++ MLM_eval() ++ 
+diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-07 20:15:40.860000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-07 20:15:40.870000000 +0800 +@@ -100,6 +100,16 @@ + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ # modelarts sdk fit ++ config.schema_file = None ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "ms_bert_large.ckpt")): ++ config.load_checkpoint_path = os.path.join(base_path, "ms_bert_large.ckpt") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_data_dir = os.path.join(base_path, "val") ++ + if pre_process: + pre_process() + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r2.1.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r2.1.patch new file mode 100644 index 0000000000000000000000000000000000000000..932d168243efeb69768fc4c6af56e3529d7b445f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r2.1.patch @@ -0,0 +1,180 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2023-04-07 20:15:40.860000000 +0800 ++++ code/pretrain_eval.py 2023-04-07 20:15:40.880000000 +0800 +@@ -33,6 +33,10 @@ + ''' + devid = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ # for modelarts mode eval after traing, should set alone and no need set device_id ++ #context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ from mindspore.context import ParallelMode ++ context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE) + dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir, 
dataset_format=cfg.dataset_format) + net_for_pretraining = BertPretrainEval(bert_net_cfg) + net_for_pretraining.set_train(False) +@@ -53,6 +57,18 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ v = v.item() ++ import moxing as mox ++ from src.model_utils.device_adapter import get_device_num ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(cfg.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(v)) ++ accuracy_file1 = os.path.join(cfg.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(v)) ++ ranksize_file = os.path.join(cfg.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2023-04-07 20:15:40.850000000 +0800 ++++ code/run_pretrain.py 2023-04-07 20:15:40.870000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -43,6 +44,7 @@ + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num ++from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -192,8 +194,18 @@ + net_with_grads = BertNetworkMatchBucket(net_with_grads, bert_net_cfg.seq_length, cfg.bucket_list) + return net_with_grads + ++def modelarts_post_process(): ++ def is_ckpt(name): ++ if name.endswith('ckpt'): ++ return True ++ return False ++ ckpt_save_dir = 
os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) ++ if os.path.exists(ckpt_save_dir): ++ ckpts = list(filter(is_ckpt, os.listdir(ckpt_save_dir))) ++ ckpts.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) ++ cfg.eval_ckpt = os.path.join(ckpt_save_dir, ckpts[-1]) + +-@moxing_wrapper(pre_process=modelarts_pre_process) ++@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process) + def run_pretrain(): + """pre-train bert_clue""" + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) +@@ -268,10 +280,87 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ end_time = time.time() ++ data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ throughput_rate = data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} train_step:{} ds getdataset:{} new_repeat_count:{} data_sum:{} single_throughput_rate:{}".format( ++ start_time, end_time, cfg.train_steps, ds.get_dataset_size(), new_repeat_count, data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = cfg.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(cfg.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = 
open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set OK") ++ else: ++ print("singleserver_mode not set") ++ + set_seed(0) + run_pretrain() ++ ++ if get_rank() == 0: ++ from pretrain_eval import MLM_eval ++ MLM_eval() ++ +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-07 20:15:40.860000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-07 20:15:40.870000000 +0800 +@@ -100,6 +100,16 @@ + if not os.path.exists(config.output_path): + 
os.makedirs(config.output_path) + ++ # modelarts sdk fit ++ config.schema_file = None ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "ms_bert_large.ckpt")): ++ config.load_checkpoint_path = os.path.join(base_path, "ms_bert_large.ckpt") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_data_dir = os.path.join(base_path, "val") ++ + if pre_process: + pre_process() + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r2.2.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r2.2.patch new file mode 100644 index 0000000000000000000000000000000000000000..932d168243efeb69768fc4c6af56e3529d7b445f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/modelarts_r2.2.patch @@ -0,0 +1,180 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2023-04-07 20:15:40.860000000 +0800 ++++ code/pretrain_eval.py 2023-04-07 20:15:40.880000000 +0800 +@@ -33,6 +33,10 @@ + ''' + devid = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ # for modelarts mode eval after traing, should set alone and no need set device_id ++ #context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) ++ from mindspore.context import ParallelMode ++ context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE) + dataset = create_eval_dataset(cfg.batch_size, 1, data_dir=cfg.eval_data_dir, dataset_format=cfg.dataset_format) + net_for_pretraining = BertPretrainEval(bert_net_cfg) + net_for_pretraining.set_train(False) +@@ -53,6 +57,18 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ v = v.item() ++ import moxing as mox ++ from 
src.model_utils.device_adapter import get_device_num ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(cfg.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(v)) ++ accuracy_file1 = os.path.join(cfg.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(v)) ++ ranksize_file = os.path.join(cfg.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2023-04-07 20:15:40.850000000 +0800 ++++ code/run_pretrain.py 2023-04-07 20:15:40.870000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -43,6 +44,7 @@ + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num ++from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -192,8 +194,18 @@ + net_with_grads = BertNetworkMatchBucket(net_with_grads, bert_net_cfg.seq_length, cfg.bucket_list) + return net_with_grads + ++def modelarts_post_process(): ++ def is_ckpt(name): ++ if name.endswith('ckpt'): ++ return True ++ return False ++ ckpt_save_dir = os.path.join(cfg.save_checkpoint_path, 'ckpt_' + str(get_rank())) ++ if os.path.exists(ckpt_save_dir): ++ ckpts = list(filter(is_ckpt, os.listdir(ckpt_save_dir))) ++ ckpts.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) ++ cfg.eval_ckpt = os.path.join(ckpt_save_dir, ckpts[-1]) + 
+-@moxing_wrapper(pre_process=modelarts_pre_process) ++@moxing_wrapper(pre_process=modelarts_pre_process, post_process=modelarts_post_process) + def run_pretrain(): + """pre-train bert_clue""" + context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id) +@@ -268,10 +280,87 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ end_time = time.time() ++ data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ throughput_rate = data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} train_step:{} ds getdataset:{} new_repeat_count:{} data_sum:{} single_throughput_rate:{}".format( ++ start_time, end_time, cfg.train_steps, ds.get_dataset_size(), new_repeat_count, data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = cfg.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(cfg.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def 
generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set OK") ++ else: ++ print("singleserver_mode not set") ++ + set_seed(0) + run_pretrain() ++ ++ if get_rank() == 0: ++ from pretrain_eval import MLM_eval ++ MLM_eval() ++ +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-07 20:15:40.860000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-07 20:15:40.870000000 +0800 +@@ -100,6 +100,16 @@ + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ # modelarts sdk fit ++ config.schema_file = None ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if 
os.path.exists(os.path.join(base_path, "ms_bert_large.ckpt")): ++ config.load_checkpoint_path = os.path.join(base_path, "ms_bert_large.ckpt") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_data_dir = os.path.join(base_path, "val") ++ + if pre_process: + pre_process() + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..3b91c0974790f67719ec70e5e7523a99204e4574 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/patch.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. $SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r2.3"; } + + modelzoo_sub_dir="mindspore/model_zoo/official/nlp/bert" + if [ "$branch_args" == "r1.1" ];then + branch="r1.1" + patch_file_name="r1.1" + commitid="9c133b6f709e12ed7085c31f028e7c925ee57828" + git_url="https://gitee.com/mindspore/mindspore.git" + elif [ "$branch_args" == "r1.2" ];then + branch="r1.2" + patch_file_name="r1.2" + commitid="cd002779dc5e2bc2da85b9a33e8950aa3bb50ed2" + git_url="https://gitee.com/mindspore/mindspore.git" + elif [ "$branch_args" == "r1.3" ];then + branch="r1.3" + patch_file_name="r1.3" + commitid="d9d4960262617d964d669ef8e3287daf347d5a7c" + git_url="https://gitee.com/mindspore/mindspore.git" + elif [ "$branch_args" == "r1.5" ];then + branch="master" + patch_file_name="r1.5" + commitid="a6cbc7bc9e23fd04b53a406e72ba87e88d7980d0" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/bert" + elif [ "$branch_args" == "r1.6" ];then + branch="r1.6" + 
patch_file_name="r1.6" + commitid="6496c699bd404076b12a6edcc40889dafaeb5285" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/bert" + elif [ "$branch_args" == "r1.7" ];then + branch="master" + patch_file_name="r1.7" + commitid="3406fdabaee92f1b22ce0703fa25befa3c40d18e" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/bert" + elif [ "$branch_args" == "r1.8" ];then + branch="r1.8" + patch_file_name="r1.8" + commitid="e68e09de9e97eeccc0804bc5f43f764a7a2bdbee" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/bert" + elif [ "$branch_args" == "r1.8" ];then + branch="master" + patch_file_name="r1.8" + commitid="b68b6bfa919465567d89bc7fdcf6d0e63967d5aa" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/bert" + elif [ "$branch_args" == "r1.9" ];then + branch="r1.9" + patch_file_name="r1.9" + commitid="5318681496ef9a37d337737325ad1b238ef75917" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/bert" + elif [ "$branch_args" == "r1.10" ];then + branch="r1.10" + patch_file_name="r1.10" + commitid="8f7331e6a846e7c306dc8ac30313d9f07cf6ee98" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/bert" + elif [ "$branch_args" == "r2.0" ];then + branch="r2.0" + patch_file_name="r2.0" + commitid="f211f336e8bee3cf531bcad5f611f408069c6f9f" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/Bert" + elif [ "$branch_args" == "r2.1" ];then + branch="r2.1" + patch_file_name="r2.1" + commitid="44f2dc18e9bd52c6bcadd18f6567817ad798f641" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/Bert" + elif [ "$branch_args" == "r2.2" ];then + branch="master" + patch_file_name="r2.2" + commitid="bb9ab4fdfb2fc205ffeb4dd671be77312908ef88" + 
git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/Bert" + elif [ "$branch_args" == "r2.3" ];then + branch="master" + patch_file_name="r2.3" + commitid="c94da0701a9ede6c93df4cd5fec88df7942a1dcc" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/Bert" + else + echo "bad parameters : $1" + return $ret_error + fi + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.10.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.10.patch new file mode 100644 index 0000000000000000000000000000000000000000..f9b4505929628d51c24b789fc707142c4a3bd5d2 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.10.patch @@ -0,0 +1,76 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2022-12-16 14:59:50.550000000 +0800 ++++ code/pretrain_eval.py 2022-12-16 14:59:50.580000000 +0800 +@@ -53,6 +53,15 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2022-12-16 14:59:50.530000000 +0800 ++++ code/run_pretrain.py 2022-12-16 14:59:50.560000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -44,6 +45,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num + _current_dir = os.path.dirname(os.path.realpath(__file__)) ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + + def _set_bert_all_reduce_split(): +@@ -268,9 +275,31 @@ + 
callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) +- ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if __name__ == '__main__': + set_seed(0) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.5.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.5.patch new file mode 100644 index 0000000000000000000000000000000000000000..154dab22a7ea53c6fdde0197fee4057934b460b4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.5.patch @@ -0,0 +1,75 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2021-12-10 14:27:15.670000000 +0800 ++++ code/pretrain_eval.py 2021-12-10 14:27:15.680000000 +0800 +@@ -53,6 +53,15 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2021-12-10 14:27:15.670000000 +0800 ++++ code/run_pretrain.py 2021-12-10 14:27:15.670000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -45,6 +46,12 @@ + from src.model_utils.device_adapter import get_device_id, get_device_num + _current_dir = os.path.dirname(os.path.realpath(__file__)) + ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + def _set_bert_all_reduce_split(): + """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" +@@ -262,8 +269,31 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. 
") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + + if __name__ == '__main__': diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.6.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.6.patch new file mode 100644 index 0000000000000000000000000000000000000000..8874b514c097697e0b5ea0e85874e07549ffa4b6 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.6.patch @@ -0,0 +1,77 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2022-01-25 15:31:09.227309487 +0800 ++++ code/pretrain_eval.py 2022-01-25 15:31:09.235309621 +0800 +@@ -53,6 +53,15 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2022-01-25 15:31:09.227309487 +0800 ++++ code/run_pretrain.py 2022-01-25 15:31:09.235309621 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -44,7 +45,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num + _current_dir = os.path.dirname(os.path.realpath(__file__)) +- ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + def _set_bert_all_reduce_split(): + """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" +@@ -262,8 +268,32 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() ++ + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ 
print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + + if __name__ == '__main__': diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.7.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.7.patch new file mode 100644 index 0000000000000000000000000000000000000000..6ae1e5ab13a8c09ba568adbed0105c3986df92f8 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.7.patch @@ -0,0 +1,77 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2022-05-30 13:10:29.271296733 +0800 ++++ code/pretrain_eval.py 2022-05-30 13:10:29.283296877 +0800 +@@ -53,6 +53,15 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2022-05-30 13:10:29.271296733 +0800 ++++ code/run_pretrain.py 2022-05-30 13:10:29.283296877 +0800 +@@ -17,6 +17,8 @@ + python run_pretrain.py + """ + import os ++import time ++ + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -43,6 +45,13 @@ + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True ++ + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -262,8 +271,31 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is 
not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + + if __name__ == '__main__': diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.8.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.8.patch new file mode 100644 index 0000000000000000000000000000000000000000..ed4e8538e526d541bb2ed620cf564be14f871a0d --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.8.patch @@ -0,0 +1,78 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2022-07-07 09:44:09.800000000 +0800 ++++ code/pretrain_eval.py 2022-07-07 09:44:09.816000000 +0800 +@@ -53,6 +53,15 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2022-07-07 09:44:09.800000000 +0800 ++++ code/run_pretrain.py 2022-07-07 09:44:09.816000000 +0800 +@@ -17,6 +17,8 @@ + python run_pretrain.py + """ + import os ++import time ++ + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -43,6 +45,13 @@ + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True ++ + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -262,9 +271,31 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) +- ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' 
is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if __name__ == '__main__': + set_seed(0) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.9.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.9.patch new file mode 100644 index 0000000000000000000000000000000000000000..627181e0ebc1bb826e88350740ef537faba3ab59 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r1.9.patch @@ -0,0 +1,80 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2022-10-18 11:29:34.308000000 +0800 ++++ code/pretrain_eval.py 2022-10-18 11:29:34.324000000 +0800 +@@ -53,9 +53,17 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + +- + if __name__ == "__main__": + DEVICE_ID = 0 + os.environ['DEVICE_ID'] = str(DEVICE_ID) +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2022-10-18 11:29:34.312000000 +0800 ++++ code/run_pretrain.py 2022-10-18 11:29:34.324000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -43,6 +44,13 @@ + from src.model_utils.config import config as cfg, bert_net_cfg + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True ++ + _current_dir = os.path.dirname(os.path.realpath(__file__)) + + +@@ -268,9 +276,31 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) +- ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") 
++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if __name__ == '__main__': + set_seed(0) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.0.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.0.patch new file mode 100644 index 0000000000000000000000000000000000000000..2f8dd4647fabdfc81f1eb6c35c57859287a4af5f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.0.patch @@ -0,0 +1,77 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2023-03-27 15:45:24.776000000 +0800 ++++ code/pretrain_eval.py 2023-03-27 15:45:24.796000000 +0800 +@@ -53,6 +53,15 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2023-03-27 15:45:24.776000000 +0800 ++++ code/run_pretrain.py 2023-03-27 15:45:24.796000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -44,7 +45,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num + _current_dir = os.path.dirname(os.path.realpath(__file__)) +- ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + def _set_bert_all_reduce_split(): + """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" +@@ -268,9 +274,31 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) +- ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ 
print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if __name__ == '__main__': + set_seed(0) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.1.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.1.patch new file mode 100644 index 0000000000000000000000000000000000000000..2f8dd4647fabdfc81f1eb6c35c57859287a4af5f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.1.patch @@ -0,0 +1,77 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2023-03-27 15:45:24.776000000 +0800 ++++ code/pretrain_eval.py 2023-03-27 15:45:24.796000000 +0800 +@@ -53,6 +53,15 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2023-03-27 15:45:24.776000000 +0800 ++++ code/run_pretrain.py 2023-03-27 15:45:24.796000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -44,7 +45,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num + _current_dir = os.path.dirname(os.path.realpath(__file__)) +- ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + def _set_bert_all_reduce_split(): + """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" +@@ -268,9 +274,31 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) +- ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ 
print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if __name__ == '__main__': + set_seed(0) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.2.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.2.patch new file mode 100644 index 0000000000000000000000000000000000000000..2f8dd4647fabdfc81f1eb6c35c57859287a4af5f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.2.patch @@ -0,0 +1,77 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2023-03-27 15:45:24.776000000 +0800 ++++ code/pretrain_eval.py 2023-03-27 15:45:24.796000000 +0800 +@@ -53,6 +53,15 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2023-03-27 15:45:24.776000000 +0800 ++++ code/run_pretrain.py 2023-03-27 15:45:24.796000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -44,7 +45,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num + _current_dir = os.path.dirname(os.path.realpath(__file__)) +- ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + def _set_bert_all_reduce_split(): + """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" +@@ -268,9 +274,31 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) +- ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ 
print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if __name__ == '__main__': + set_seed(0) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.3.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.3.patch new file mode 100644 index 0000000000000000000000000000000000000000..417525e8f161f3d4095dd2d59da91a23b8264230 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/r2.3.patch @@ -0,0 +1,77 @@ +diff -Nur origin/pretrain_eval.py code/pretrain_eval.py +--- origin/pretrain_eval.py 2023-03-27 15:45:24.776000000 +0800 ++++ code/pretrain_eval.py 2023-03-27 15:45:24.796000000 +0800 +@@ -53,6 +53,15 @@ + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ", v) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(v)) + print("==============================================================") + + +diff -Nur origin/run_pretrain.py code/run_pretrain.py +--- origin/run_pretrain.py 2023-03-27 15:45:24.776000000 +0800 ++++ code/run_pretrain.py 2023-03-27 15:45:24.796000000 +0800 +@@ -17,6 +17,7 @@ + python run_pretrain.py + """ + import os ++import time + import mindspore.communication.management as D + from mindspore.communication.management import get_rank + import mindspore.common.dtype as mstype +@@ -44,7 +45,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id, get_device_num + _current_dir = os.path.dirname(os.path.realpath(__file__)) +- ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + def _set_bert_all_reduce_split(): + """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24.""" +@@ -268,9 +274,31 @@ + callback.append(eval_callback) + + model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer) ++ model.build(ds, sink_size=cfg.data_sink_steps, epoch=new_repeat_count) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(new_repeat_count, ds, callbacks=callback, + dataset_sink_mode=(cfg.enable_data_sink == "true"), sink_size=cfg.data_sink_steps) +- ++ all_data_sum = new_repeat_count * cfg.data_sink_steps * cfg.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ 
print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if __name__ == '__main__': + set_seed(0) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..d72557eb7ae6ba01fa1ce4fba4f7bbe69989c190 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/cluster_offline_run.sh @@ -0,0 +1,80 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. $CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + : "${EVAL_DATA_PATH?EVAL_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info 
"-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + + [ -d $BASE_PATH/result ] && cp ${RESULT_PATH}/* -rf $BASE_PATH/result/ + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/modelarts_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/modelarts_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d9325671673385e7c0e7703108806af7288ee4e --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/modelarts_run.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +. $CODE_PATH/common/common.sh +. 
$CODE_PATH/common/log_util.sh + +init() +{ + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + logger_Info "init called" +} + +run_train() +{ + logger_Info "run_train called" + [ ! -f $CODE_PATH/code/ma-pre-start.sh ] && touch $CODE_PATH/code/ma-pre-start.sh + sed -i '/SINGLESERVER_MODE=/d' $CODE_PATH/code/ma-pre-start.sh + [[ $MODELARTS_VERSION ]]&&[[ $MODELARTS_VERSION == "V2" ]] && modelarts_version="V2" || modelarts_version="V1" + if [ "$SINGLESERVER_MODE" == "True" ];then + echo "now set singleserver_mode OK" + echo -e "\nexport SINGLESERVER_MODE=True" >> $CODE_PATH/code/ma-pre-start.sh + + ${PYTHON_COMMAND} -u ${CODE_PATH}/common/train_modelarts.py --local_code_path $CODE_PATH/code --single_server_mode --modelarts_version $modelarts_version || { logger_Warn "run train modelarts failed ret:$?";return 1; } + else + echo "now not set singleserver_mode" + ${PYTHON_COMMAND} -u ${CODE_PATH}/common/train_modelarts.py --local_code_path $CODE_PATH/code --modelarts_version $modelarts_version || { logger_Warn "run train modelarts failed ret:$?";return 1; } + fi + ${PYTHON_COMMAND} $CODE_PATH/ais_utils.py set_result "training" "result" "OK" +} + +run_eval() +{ + logger_Info "run_eval called" +} + +get_result() +{ + logger_Info "get_result called" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..3b03d7eb1927815f8a80517f4de9c4ca6115f413 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_bert/scripts/run_node.sh @@ -0,0 +1,113 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +function get_train_cmd() +{ + [[ $RANK_SIZE -gt 1 ]] && DISTRUTE_ENABLE="true" || DISTRUTE_ENABLE="false" + + CONFIG_FILE=$WORK_PATH/code/pretrain_config_Ascend_Boost.yaml + + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/run_pretrain.py \ + --distribute=$DISTRUTE_ENABLE \ + --epoch_size=$EPOCH_SIZE \ + --enable_save_ckpt=true \ + --enable_lossscale=true \ + --do_shuffle=true \ + --enable_data_sink=true \ + --data_sink_steps=100 \ + --accumulation_steps=1 \ + --save_checkpoint_path=$RUN_PATH \ + --save_checkpoint_steps=$TRAIN_STEPS \ + --save_checkpoint_num=1 \ + --load_checkpoint_path=$PRETRAIN_MODEL_PATH \ + --data_dir=${TRAIN_DATA_PATH} \ + --device_id=${DEVICE_ID} \ + --device_num=${DEVICE_NUM} \ + --train_steps=${TRAIN_STEPS} \ + --config_path=$CONFIG_FILE + " + + export MS_DISABLE_REF_MODE=0 + export MS_ENABLE_FORMAT_MODE=0 + return 0 +} + +function get_eval_cmd() +{ + chipname=`npu-smi info -t board -i 0 -c 0 | grep 'Chip Name' | awk {'print $4'}` + CONFIG_FILE=$WORK_PATH/code/pretrain_config_Ascend_Boost.yaml + sed -i "s|eval_data_dir:.*|eval_data_dir: '$EVAL_DATA_PATH'|g" "$CONFIG_FILE" + sed -i "s|schema_file:.*|schema_file: null|g" "$CONFIG_FILE" + sed -i "s|eval_ckpt:.*|eval_ckpt: '$CHECKPOINT_PATH'|g" "$CONFIG_FILE" + eval_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/pretrain_eval.py --config_path=$CONFIG_FILE" + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH + source $WORK_PATH/config/mindspore_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + + check_mindspore_run_ok 
${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return 1; } + logger_Debug "mindspore running successfully" + + check_file_valid ${PRETRAIN_MODEL_PATH} || { logger_Warn "PRETRAIN_MODEL_PATH:${PRETRAIN_MODEL_PATH} not valid" ; return 1; } + logger_Debug "PRETRAIN_MODEL_PATH path valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" + + check_path_valid "${EVAL_DATA_PATH}" || { logger_Warn "EVAL_DATA_PATH:${EVAL_DATA_PATH} not valid path" ; return 1; } + logger_Debug "EVAL_DATA_PATH is valid" +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "false" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + CHECKPOINT_PATH=`find ${WORK_PATH}/train_parallel$RANK_ID/ -name "*.ckpt" | xargs ls -t | awk 'NR==1{print}'` + [ -f $CHECKPOINT_PATH ] || { logger_Warn "CHECKPOINT_PATH:${CHECKPOINT_PATH} not valid path" ; return 1; } + cp $CHECKPOINT_PATH $RESULT_PATH/ + RUN_PATH=$WORK_PATH/train_parallel$RANK_ID + cd $RUN_PATH + get_eval_cmd + echo "start eval RUN_PATH:${RUN_PATH} SERVER_ID:$SERVER_ID rank $RANK_ID device $DEVICE_ID begin cmd:${eval_run_cmd}" + $eval_run_cmd || { echo "run eval node error ret:$?"; return 1; } + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..065998ce7d27c07c80771ff446961caaac88439b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..8091e851f0e2d95b77e873eba9604340efeb11f9 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/config/config.sh @@ -0,0 +1,16 @@ +#!/bin/bash +export PYTHON_COMMAND=python3.7 +export TRAIN_DATA_PATH=/home/datasets/vocaug +export TRAIN_DATA_FILE=/home/datasets/vocaug/vocaug_mindrecord/vocaug_mindrecord0 +export PRETRAIN_MODEL_PATH=/home/datasets/pretrain_model/deeplabv3/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt +export EVAL_DATA_FILE_PATH=/home/datasets/vocaug/voc_val_lst.txt +export EPOCH_SIZE=200 + +export RANK_SIZE=8 +export DEVICE_NUM=8 + +# need if rank_size > 1 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_64.json +# cluster need for node info +#export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json + diff --git "a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/doc/ais-bench+mindspore-deeplabv3\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/doc/ais-bench+mindspore-deeplabv3\344\275\277\347\224\250\350\257\264\346\230\216.md" new file mode 100644 index 0000000000000000000000000000000000000000..a98a0fc83ac953bbe62f97cbfd0402d7da46b7ae --- /dev/null +++ "b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/doc/ais-bench+mindspore-deeplabv3\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -0,0 +1,75 @@ +# Ais-Bench+Mindspore+deeplabv3使用说明 + +## 简介 + +AI Server Benchmark 是按《信息技术 人工智能 服务器系统性能测试规范》对人工智能服务器系统的性能进行性能评估的测试系统(测试套件),简称Ais-Bench软件。 + +## 使用前提 + +本程序包运行需要基于以下前提 + +1. Atlas 800-9000设备 +2. 安装好CANN包和Mindspore对应版本。并可以运行正常mindspore测试程序。 +3. 
保存数据集和相关预处理文件等到设备中。 + +## 集群节点配置 + +如果运行设备大于1个设备,那么需要运行设置ssh节点文件。说明节点信息 +{ +"cluster": { +"xx.xx.xx.xx": { # 节点ip 必须与ranktable中对应 +"user": "xxxx", # 用户名 免密可以不用设置 +"pd": "xx", # 密码 免密不用设置 +"port": xx # 端口 默认22 可以不用设置 +}, +"xx.xx.xx.xx": { +"user": "xxxx", +"pd": "xx", +"port": xx +} +} +} + +## 集群节点免密设置 + +设置密钥认证的参考操作如下: + ++ ssh-keygen -t rsa -b 2048 # 登录管理节点并生成SSH Key。安全起见,建议用户到"Enter passphrase"步骤时输入密钥密码,且符合密码复杂度要求。建议执行这条命令前先将umask设置为0077,执行完后再恢复原来umask值。 + ++ ssh-copy-id -i ~/.ssh/id_rsa.pub ``@`` # 将管理节点的公钥拷贝到所有节点的机器上,``@``替换成要拷贝到的对应节点的账户和ip。 + ++ 设置ssh代理管理ssh密钥,避免工具批量安装操作过程中输入密钥密码和节点密码 + +``` +ssh-agent bash # 开启ssh-agent的bash进程 +``` + + +``` +ssh-add # 向ssh-agent添加私钥 +``` + + +## 配置文件信息 + +> ``` +> export PYTHON_COMMAND=python3.7 +> export TRAIN_DATA_PATH=/home/datasets/VOCdevkit/VOC2012 +> export TRAIN_DATA_FILE=/home/datasets/VOCdevkit/VOC2012/dataset/mindrecored_deeplabv3.mindrecord0 +> export PRETRAIN_MODEL_PATH=/home/datasets/pretrain_model/deeplabv3/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt +> export EVAL_DATA_FILE_PATH=/home/datasets/VOCdevkit/VOC2012/voc_val_lst.txt +> export EPOCH_SIZE=200 +> +> export RANK_SIZE=8 +> export DEVICE_NUM=8 +> +> # need if rank_size > 1 +> export RANK_TABLE_FILE=/home/tools/rank_table_8p_64.json +> # cluster need for node info +> #export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json +> ``` + +说明: +配置文件默认是8卡训练。 +单卡训练时,需要设置RANK_SIZE=1,DEVICE_NUM=1,且不能使用RANK_TABLE_FILE环境变量. 
+同时还请按需增加指定执行卡序号变量声明export SINGLE_CARD_INDEX。默认 SINGLE_CARD_INDEX=0,可以不显式声明。其它卡时需要显式声明,比如export SINGLE_CARD_INDEX=6 diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..de43867ed395996bfb06ff18a82a6db30461d7d6 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/patch.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. $SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r1.5"; } + + if [ "$branch_args" == "r1.5" ];then + branch="master" + patch_file_name="r1.5" + commitid="abc34438588942642e45e7cf1e516134952a2f86" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/deeplabv3" + elif [ "$branch_args" == "r1.6" ];then + branch="master" + patch_file_name="r1.6" + commitid="a58deaa4745a71fef902b73ed220054b6c072f24" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/deeplabv3" + elif [ "$branch_args" == "r1.9" ];then + branch="master" + patch_file_name="r1.9" + commitid="adccb235dc4d0f00a8d9abd6cfcc2fc43c83570b" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/deeplabv3" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local 
run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/r1.9.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/r1.9.patch new file mode 100644 index 0000000000000000000000000000000000000000..4072566c26cfe315a3928d64cbd1b776d5c3dad0 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/r1.9.patch @@ -0,0 +1,56 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2022-09-05 17:24:42.232000000 +0800 ++++ code/eval.py 2022-09-05 17:24:42.244000000 +0800 +@@ -247,6 +247,16 @@ + iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) + print('per-class IoU', iu) + print('mean IoU', np.nanmean(iu)) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ print("ACC_DIR:", ACC_DIR) ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(np.nanmean(iu))) + + + if __name__ == '__main__': +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-09-05 17:24:42.240000000 +0800 ++++ code/train.py 2022-09-05 17:24:42.248000000 +0800 +@@ -201,8 +201,31 @@ + keep_checkpoint_max=args.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix=args.model, directory=args.train_dir, config=config_ck) + cbs.append(ckpoint_cb) ++ model.build(dataset, sink_size=dataset.get_dataset_size(), epoch=args.train_epochs) + ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(args.train_epochs, dataset, callbacks=cbs, dataset_sink_mode=(args.device_target != "CPU")) ++ all_data_sum = args.train_epochs * dataset.get_dataset_size() * args.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum / (end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + + if __name__ == '__main__': + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2bdca78d3eaa658e65fe15ee31a11fab400c2e16 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/scripts/cluster_offline_run.sh @@ -0,0 +1,78 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. $CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_FILE?TRAIN_DATA_FILE not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train 
start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..a6961186199b141af641c48ea54f1efa2c99bcca --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deeplabv3/scripts/run_node.sh @@ -0,0 +1,106 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh
+
+# 获取训练命令
+function get_train_cmd()
+{
+    train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py --data_file=$TRAIN_DATA_FILE \
+    --train_dir=$RUN_PATH \
+    --train_epochs=$EPOCH_SIZE \
+    --batch_size=32 \
+    --crop_size=513 \
+    --base_lr=0.015 \
+    --lr_type=cos \
+    --min_scale=0.5 \
+    --max_scale=2.0 \
+    --ignore_label=255 \
+    --num_classes=21 \
+    --model=deeplab_v3_s16 \
+    --ckpt_pre_trained=$PRETRAIN_MODEL_PATH \
+    --save_steps=1500 \
+    --keep_checkpoint_max=200
+    "
+    return 0
+}
+
+function get_eval_cmd()
+{
+    eval_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/eval.py \
+    --data_root=$TRAIN_DATA_PATH \
+    --data_lst=$EVAL_DATA_FILE_PATH \
+    --batch_size=32 \
+    --crop_size=513 \
+    --ignore_label=255 \
+    --num_classes=21 \
+    --model=deeplab_v3_s16 \
+    --scales_type=0 \
+    --freeze_bn=True \
+    --ckpt_path=$CHECKPOINT_PATH \
+    "
+    return 0
+}
+
+function node_init()
+{
+    export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code
+    source $WORK_PATH/config/mindspore_env.sh
+    # for eval env set
+    [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; }
+    [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; }
+}
+
+function node_check()
+{
+    CONFIG_FILE_PATH=$1
+    source $CONFIG_FILE_PATH
+
+    # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE
+    node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; }
+    # 检测是否安装对应框架软件
+
+    check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return 1; }
+    logger_Debug "mindspore running successfully"
+
+    check_file_valid "${TRAIN_DATA_FILE}" || { logger_Warn "TRAIN_DATA_FILE:${TRAIN_DATA_FILE} not valid file" ; return 1; }
+    logger_Debug "TRAIN_DATA_FILE is valid"
+}
+
+
+function node_train()
+{
+    # 调用通用训练接口
+    node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; }
+}
+
+function node_eval()
+{
+    
CHECKPOINT_PATH=`find ${WORK_PATH}/train_parallel$RANK_ID/ -name "*.ckpt" | xargs ls -t | awk 'NR==1{print}'` + [ -f $CHECKPOINT_PATH ] || { logger_Warn "CHECKPOINT_PATH:${CHECKPOINT_PATH} not valid path" ; return 1; } + RUN_PATH=$WORK_PATH/train_parallel$RANK_ID + cd $RUN_PATH + get_eval_cmd + echo "start eval RUN_PATH:${RUN_PATH} SERVER_ID:$SERVER_ID rank $RANK_ID device $DEVICE_ID begin cmd:${eval_run_cmd}" + $eval_run_cmd || { echo "run eval node error ret:$?"; return 1; } + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..065998ce7d27c07c80771ff446961caaac88439b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..419eba6326f20d842ade991903e8dbcde9c9f7a1 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/config/config.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export PYTHON_COMMAND=python3.7 +export DEVICE_TARGET='CPU' +export RANK_SIZE=1 +export DEVICE_NUM=1 + diff --git "a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/doc/ais-bench+mindspore-deepspeechv2\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/doc/ais-bench+mindspore-deepspeechv2\344\275\277\347\224\250\350\257\264\346\230\216.md" new file mode 100644 index 0000000000000000000000000000000000000000..6f346a36fd09bf6af0c3cf3486597ba74afb64ec --- /dev/null +++ "b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/doc/ais-bench+mindspore-deepspeechv2\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -0,0 +1,109 @@ +# Ais-Bench+Mindspore+deepspeechv2使用说明 + +## 1.简介 + +AI Server Benchmark 是按《信息技术 人工智能 服务器系统性能测试规范》对人工智能服务器系统的性能进行性能评估的测试系统(测试套件),简称Ais-Bench软件。 + +## 2.使用前提 + +本程序包运行需要基于以下前提 + +1. Atlas 800-9000设备 +2. 安装好CANN包和Mindspore对应版本。并可以运行正常mindspore测试程序。 +3. 
保存数据集和相关预处理文件等到设备中。 + +## 3.集群节点配置 + +### 3.1 rank_table文件 + +单机或集群rank_table文件生成方法,请参照[这里](https://gitee.com/mindspore/models/tree/master/utils/hccl_tools#merge_hccl)。 + +示例:rank_table_16p_64_66.json + +```bash +{ + "version": "1.0", + "server_count": "2", + "server_list": [ + { + "server_id": "xx.xx.xx.xx", + "device": [ + {"device_id": "0", "device_ip": "xx.xx.xx.xx", "rank_id": "0"}, + {"device_id": "1", "device_ip": "xx.xx.xx.xx", "rank_id": "1"}, + {"device_id": "2", "device_ip": "xx.xx.xx.xx", "rank_id": "2"}, + {"device_id": "3", "device_ip": "xx.xx.xx.xx", "rank_id": "3"}, + {"device_id": "4", "device_ip": "xx.xx.xx.xx", "rank_id": "4"}, + {"device_id": "5", "device_ip": "xx.xx.xx.xx", "rank_id": "5"}, + {"device_id": "6", "device_ip": "xx.xx.xx.xx", "rank_id": "6"}, + {"device_id": "7", "device_ip": "xx.xx.xx.xx", "rank_id": "7"} + ], + "host_nic_ip": "reserve" + }, + { + "server_id": "xx.xx.xx.xx", + "device": [ + {"device_id": "0", "device_ip": "xx.xx.xx.xx", "rank_id": "8"}, + {"device_id": "1", "device_ip": "xx.xx.xx.xx", "rank_id": "9"}, + {"device_id": "2", "device_ip": "xx.xx.xx.xx", "rank_id": "10"}, + {"device_id": "3", "device_ip": "xx.xx.xx.xx", "rank_id": "11"}, + {"device_id": "4", "device_ip": "xx.xx.xx.xx", "rank_id": "12"}, + {"device_id": "5", "device_ip": "xx.xx.xx.xx", "rank_id": "13"}, + {"device_id": "6", "device_ip": "xx.xx.xx.xx", "rank_id": "14"}, + {"device_id": "7", "device_ip": "xx.xx.xx.xx", "rank_id": "15"} + ], + "host_nic_ip": "reserve" + } + ], + "status": "completed" +} +``` + +### 3.2 ssh节点文件 + +运行设备大于1个设备,则需要运行设置ssh节点文件。说明节点信息 +示例:ssh64_66.json + +```bash +{ + "cluster": { + "xx.xx.xx.xx": { # 节点ip 必须与ranktable中的server_id一一对应 + "user": "xxxx", # 用户名 免密可以不用设置 + "pd": "xxxx", # 密码 免密不用设置 + "port": xx # 容器端口,默认22。可以不设置。本行缺失时,表示测试在该节点本地(非容器)运行,设置时表示在容器中运行并提供指定端口访问能力 + }, + "xx.xx.xx.xx": { + "user": "xxxx", + "pd": "xxxx", + "port": xx + } + } +} +``` + +注意:该文件中的节点数目应与rank_table中的的节点数目一致。 + +## 4.集群节点免密设置 + +设置密钥认证的参考操作如下: + 
++ ssh-keygen -t rsa -b 2048 # 登录管理节点并生成SSH Key。安全起见,建议用户到"Enter passphrase"步骤时输入密钥密码,且符合密码复杂度要求。建议执行这条命令前先将umask设置为0077,执行完后再恢复原来umask值。 ++ ssh-copy-id -i ~/.ssh/id_rsa.pub ``@`` # 将管理节点的公钥拷贝到所有节点的机器上,``@``替换成要拷贝到的对应节点的账户和ip。 ++ 设置ssh代理管理ssh密钥,避免工具批量安装操作过程中输入密钥密码和节点密码 + +``` + ssh-agent bash # 开启ssh-agent的bash进程 + ssh-add # 向ssh-agent添加私钥 +``` + +## 5.配置文件信息 + +> #!/bin/bash +> export PYTHON_COMMAND=python3.7 +> export DEVICE_TARGET='CPU' +> export RANK_SIZE=1 +> export DEVICE_NUM=1 + +说明: +配置文件默认是8卡训练。 +单卡训练时,需要设置RANK_SIZE=1,DEVICE_NUM=1,且不能使用RANK_TABLE_FILE环境变量. +同时还请按需增加指定执行卡序号变量声明export SINGLE_CARD_INDEX。默认 SINGLE_CARD_INDEX=0,可以不显式声明。其它卡时需要显式声明,比如export SINGLE_CARD_INDEX=6 diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea28dc0a92b9cfe952f1062d37d6545ff1935267 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r1.5"; } + + if [ "$branch_args" == "r1.5" ];then + branch="master" + patch_file_name="r1.5" + commitid="abc34438588942642e45e7cf1e516134952a2f86" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/research/audio/deepspeech2" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2fc1ff6e4b4bef4274e6e7de99cd513fa94e1909 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/scripts/cluster_offline_run.sh @@ -0,0 +1,77 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash 
$WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..79af49a2cb50bbfde64630b2ef37a92035a7df18 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_deepspeech2/scripts/run_node.sh @@ -0,0 +1,81 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + cp $WORK_PATH/code/labels.json $RUN_PATH/ + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py --device_target=$DEVICE_TARGET " + return 0 +} + +function get_eval_cmd() +{ + eval_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/eval.py \ + --pretrain_ckpt $CHECKPOINT_PATH \ + --device_target=$DEVICE_TARGET \ + " + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/mindspore_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + + check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return 1; } + logger_Debug "mindspore running successfully" +} + + +function node_train() +{ + # 调用通用训练接口 + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + CHECKPOINT_PATH=`find ${WORK_PATH}/train_parallel$RANK_ID/ -name "*.ckpt" | xargs ls -t | awk 'NR==1{print}'` + [ -f $CHECKPOINT_PATH ] || { logger_Warn "CHECKPOINT_PATH:${CHECKPOINT_PATH} not valid path" ; return 1; } + RUN_PATH=$WORK_PATH/train_parallel$RANK_ID + cd $RUN_PATH + get_eval_cmd + echo "start eval RUN_PATH:${RUN_PATH} SERVER_ID:$SERVER_ID rank $RANK_ID device $DEVICE_ID begin cmd:${eval_run_cmd}" + $eval_run_cmd || { echo "run eval node error ret:$?"; return 1; } + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { 
logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..065998ce7d27c07c80771ff446961caaac88439b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..16c8008364a951ace48fc2b08edeef2f5c372691 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/config/config.sh @@ -0,0 +1,16 @@ +#!/bin/bash +export PYTHON_COMMAND=python3.7 +export TRAIN_DATA_PATH=/home/datasets/coco +export MINDRECORD_PATH=/home/datasets/coco/mindrecord_coco_train +export PRETRAIN_MODEL_PATH=/home/datasets/pretrain_model/faster_rcnn/pretrained_model.ckpt +export VALIDATION_JSON_FILE=/home/datasets/coco/annotations/instances_val2017.json +export EPOCH_SIZE=20 + +export RANK_SIZE=8 +export DEVICE_NUM=8 + +# need if rank_size > 1 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_66.json +# cluster need for node info +#export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json + diff --git "a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/doc/ais-bench+mindspore-deeplabv3\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/doc/ais-bench+mindspore-deeplabv3\344\275\277\347\224\250\350\257\264\346\230\216.md" new file mode 100644 index 0000000000000000000000000000000000000000..efa2897b26b39ebe92a3d58542d9b00611f98c2d --- /dev/null +++ "b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/doc/ais-bench+mindspore-deeplabv3\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -0,0 +1,72 @@ +# Ais-Bench+Mindspore+deeplabv3使用说明 + +## 简介 + +AI Server Benchmark 是按《信息技术 人工智能 服务器系统性能测试规范》对人工智能服务器系统的性能进行性能评估的测试系统(测试套件),简称Ais-Bench软件。 + +## 使用前提 + +本程序包运行需要基于以下前提 + +1. Atlas 800-9000设备 +2. 安装好CANN包和Mindspore对应版本。并可以运行正常mindspore测试程序。 +3. 保存数据集和相关预处理文件等到设备中。 +4. 
支持Resnet50预训练的ckpt文件作为backbone.ckpt + +## 集群节点配置 + +如果运行设备大于1个设备,那么需要运行设置ssh节点文件。说明节点信息 +{ +"cluster": { +"xx.xx.xx.xx": { # 节点ip 必须与ranktable中对应 +"user": "xxxx", # 用户名 免密可以不用设置 +"pd": "xx", # 密码 免密不用设置 +"port": xx # 端口 默认22 可以不用设置 +}, +"xx.xx.xx.xx": { +"user": "xxxx", +"pd": "xx", +"port": xx +} +} +} + +## 集群节点免密设置 + +设置密钥认证的参考操作如下: + ++ ssh-keygen -t rsa -b 2048 # 登录管理节点并生成SSH Key。安全起见,建议用户到"Enter passphrase"步骤时输入密钥密码,且符合密码复杂度要求。建议执行这条命令前先将umask设置为0077,执行完后再恢复原来umask值。 ++ ssh-copy-id -i ~/.ssh/id_rsa.pub ``@`` # 将管理节点的公钥拷贝到所有节点的机器上,``@``替换成要拷贝到的对应节点的账户和ip。 ++ 设置ssh代理管理ssh密钥,避免工具批量安装操作过程中输入密钥密码和节点密码 + +``` +ssh-agent bash # 开启ssh-agent的bash进程 +``` + +``` +ssh-add # 向ssh-agent添加私钥 +``` + +## 配置文件信息 + +> ``` +> export PYTHON_COMMAND=python3.7 +> export TRAIN_DATA_PATH=/home/datasets/VOCdevkit/VOC2012 +> export TRAIN_DATA_FILE=/home/datasets/VOCdevkit/VOC2012/dataset/mindrecored_deeplabv3.mindrecord0 +> export PRETRAIN_MODEL_PATH=/home/datasets/pretrain_model/deeplabv3/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt +> export EVAL_DATA_FILE_PATH=/home/datasets/VOCdevkit/VOC2012/voc_val_lst.txt +> export EPOCH_SIZE=200 +> +> export RANK_SIZE=8 +> export DEVICE_NUM=8 +> +> # need if rank_size > 1 +> export RANK_TABLE_FILE=/home/tools/rank_table_8p_64.json +> # cluster need for node info +> #export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json +> ``` + +说明: +配置文件默认是8卡训练。 +单卡训练时,需要设置RANK_SIZE=1,DEVICE_NUM=1,且不能使用RANK_TABLE_FILE环境变量. 
+同时还请按需增加指定执行卡序号变量声明export SINGLE_CARD_INDEX。默认 SINGLE_CARD_INDEX=0,可以不显式声明。其它卡时需要显式声明,比如export SINGLE_CARD_INDEX=6 diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..4123cfa0427b7e2993dd5ce293542311267e363e --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/patch.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. $SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r1.5"; } + + if [ "$branch_args" == "r1.5" ];then + branch="master" + patch_file_name="r1.5" + commitid="abc34438588942642e45e7cf1e516134952a2f86" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/faster_rcnn" + elif [ "$branch_args" == "r1.9" ];then + branch="master" + patch_file_name="r1.9" + commitid="85ecbf257f70f7a5ff45640229c529a1c3690e97" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/faster_rcnn" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! 
-d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/r1.9.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/r1.9.patch new file mode 100644 index 0000000000000000000000000000000000000000..5fe1effcd91bdf43cbf7e18c64c5b24fb8ed9676 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/r1.9.patch @@ -0,0 +1,71 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2022-09-21 10:05:25.304000000 +0800 ++++ code/eval.py 2022-09-21 10:05:25.312000000 +0800 +@@ -123,8 +123,18 @@ + eval_types = ["bbox"] + result_files = results2json(dataset_coco, outputs, "./results.pkl") + +- coco_eval(config, result_files, eval_types, dataset_coco, ++ result = coco_eval(config, result_files, eval_types, dataset_coco, + single_result=False, plot_detect_result=True) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ print("ACC_DIR:", ACC_DIR) ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(result)) + print("\nEvaluation done!") + + +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-09-21 10:05:25.308000000 +0800 ++++ code/train.py 2022-09-21 10:05:25.312000000 +0800 +@@ -35,6 +35,12 @@ + from src.model_utils.config import config + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_id ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + + def train_fasterrcnn_(): +@@ -225,8 +231,31 @@ + cb += [eval_cb] + + model = Model(net) ++ model.build(dataset, sink_size=dataset_size, epoch=config.epoch_size) ++ ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size, dataset, callbacks=cb) ++ all_data_sum = config.epoch_size * dataset.get_dataset_size() * config.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum / (end_time - start_time) + ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if __name__ == '__main__': + set_seed(1) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..ed6c8cdbd54fbfd81245116942538eeb6687bbc4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/scripts/cluster_offline_run.sh @@ -0,0 +1,80 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. $CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- 
train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + + [ -d $BASE_PATH/result ] && cp ${RESULT_PATH}/* -rf $BASE_PATH/result/ + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..b851bf4224883bb94a555398595e5cd67459bd47 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_faster_rcnn/scripts/run_node.sh @@ -0,0 +1,104 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + COCO_CONFIG_FILE=$WORK_PATH/code/default_config.yaml + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py --config_path=$COCO_CONFIG_FILE \ + --coco_root=$TRAIN_DATA_PATH \ + --pre_trained=$PRETRAIN_MODEL_PATH \ + --backbone="resnet_v1.5_50" \ + --mindrecord_dir=$MINDRECORD_PATH \ + " + + # for mindspore1.5 + export ENV_FUSION_CLEAR=1 + export ENV_SINGLE_EVAL=1 + export SKT_ENABLE=1 + export DATASET_ENABLE_NUMA=True +} + +function get_eval_cmd() +{ + CONFIG_FILE=$WORK_PATH/code/default_config.yaml + eval_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/eval.py \ + --config_path=$CONFIG_FILE \ + --device_id=0 \ + --anno_path=/home/datasets/coco/annotations/instances_val2017.json \ + --checkpoint_path=$WORK_PATH/code/scripts/train_parallel0/ckpt_0/faster_rcnn-20_7393.ckpt \ + --backbone=resnet_v1.5_50 \ + --coco_root=/home/datasets/coco \ + --mindrecord_dir=/home/datasets/coco/mindrecord_coco_train" + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/mindspore_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + + # 检测是否安装对应框架软件 + check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return 1; } + logger_Debug "mindspore running successfully" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid file" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" + + check_file_valid 
"${PRETRAIN_MODEL_PATH}" || { logger_Warn "PRETRAIN_MODEL_PATH:${PRETRAIN_MODEL_PATH} not valid file" ; return 1; } + logger_Debug "PRETRAIN_MODEL_PATH is valid" + + check_file_valid "${VALIDATION_JSON_FILE}" || { logger_Warn "VALIDATION_JSON_FILE:${VALIDATION_JSON_FILE} not valid file" ; return 1; } + logger_Debug "VALIDATION_JSON_FILE is valid" +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + CHECKPOINT_PATH=`find ${WORK_PATH}/train_parallel$RANK_ID/ -name "*.ckpt" | xargs ls -t | awk 'NR==1{print}'` + [ -f $CHECKPOINT_PATH ] || { logger_Warn "CHECKPOINT_PATH:${CHECKPOINT_PATH} not valid path" ; return 1; } + RUN_PATH=$WORK_PATH/train_parallel$RANK_ID + cd $RUN_PATH + get_eval_cmd + echo "start eval RUN_PATH:${RUN_PATH} SERVER_ID:$SERVER_ID rank $RANK_ID device $DEVICE_ID begin cmd:${eval_run_cmd}" + $eval_run_cmd || { echo "run eval node error ret:$?"; return 1; } + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/README.md b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..846beb4ffb371b7c7d074757a4e6b0529bf4bf19 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/README.md @@ -0,0 +1,201 @@ +# 基于Mindspore/mindformers框架的glm2大模型训练负载使用指南 +本文主要介绍使用基于glm2 大模型训练业务代码构建的AISBench的负载包,进行服务器性能测试的流程。 +## 运行环境前置条件 +``` +python >= 3.7 +mindspore >= 2.2 +``` +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" +``` +如果正常输出: +```bash +MindSpore version: 版本号 +The result of multiplication calculation is correct, MindSpore has been installed on platform [Ascend] successfully! +``` +说明成功。 + +## 负载包中文件夹主要目录结构 + +``` +├── ais-bench-stubs # Stubs主程序,负责流程控制、通信与数据管理等 +├── code # 业务代码目录 +│ ├── benchmark.sh # 入口脚本,会被ais-bench-stubs调用,调用业务代码,被测试者需要通过编写该脚本,对接运行的训练和推理脚本 +│ ├── config +│ │ ├── config.sh # 训练相关的配置文件,包括数据集、权重路径等信息 +│ ├── code # mindformers全部代码,嵌入了AISBench的打点上报接口 +│ │ └──mindformers +│ ├── cluster_offline_run.sh +│ ├── run_node.sh +│ ├── run_glm2_6b_finetune.yaml +│ └── run_glm2_6b_finetune_eval.yaml +├── config +│ ├── config.json # 测试配置文件,包含tester服务器信息、testerId等信息 +│ └── system.json # 被测试环境系统基本信息json文件,被测试者自行上传,比如硬件信息等 +├── dependencies # stubs的依赖组件 +│ ├── cluster # 分布式运行组件 +│ │ ├── ais_bench_cluster--py3-none-linux_.whl +│ │ ├── README.md +│ └── logging # 测试结果传输模块 +│ ├── ais_utils.py # 打点入口脚本,设置相关业务运行参数并反馈测试结果 +│ └── libais_utils.so # 测试结果传输模块lib,负责将测试结果传输到stubs模块相关文件部署 +├── log # 测试log日志。建议无需上传的日志文件,另建目录存放 +├── result # 测试结果文件。建议无需上传的结果文件,另建目录存放 +└── STUBS_PACKAGE_INTRO.md # Stubs被测试者接入使用文档 +``` +- **后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./ais-bench-stubs表示Stubs主程序** +- 运行环境安装mindformer需在code/code目录下执行`pip3 install .` +## 资源准备 +### 前置声明 +- 
运行glm2训练的Mindspore/mindformers的代码全部在`./code/code`文件夹中,资源的准备参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md),具体资源的参考详见本章其他小节。 +- **注意**:需要确认环境中是否原来已经安装了mindformers,如果安装了,请使用`pip uninstall mindformers`卸载,确保负载代码的mindformers能正常安装。 +### rank_table_file准备 +- 确保`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。 + +- 参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md)的“生成RANK_TABLE_FILE”(单机多卡情况)章节。 + +### 模型权重下载与转换 +- 参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md)的“模型权重下载与转换”章节; +- 资源链接: + - [glm2_6b.ckpt](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/XFormer_for_mindspore/glm2/glm2_6b.ckpt)(点击直接下载) + - [tokenizer](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/XFormer_for_mindspore/glm2/tokenizer.model)(点击直接下载) +- 下载后建议放至code/code/mindformers/checkpoint_download/glm2目录下(需手动创建checkpoint_download/glm2 如`mkdir -p code/code/mindformers/checkpoint_download/glm2`) +### 数据集准备 +- 参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md)的“微调--数据集准备”章节; +- 资源链接: + - [ADGEN数据集](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1)(下载后需解压) +- 下载解压后目录结构为: + ``` + AdvertiseGen + ├── train.json + └── dev.json + ``` +- 建议该目录放到code/code/mindformers/dataset_files/目录下(dataset_files需手动创建,如`mkdir -p code/code/mindformers/dataset_files/`) + +## 2 负载启动前配置项 +### 2.0 和tester连接的配置(仅在线测试需要) +`./config/config.json`和`./config/system.json`请参考《Stubs被测试者接入使用文档》中的“配置与Tester相关的配置文件”章节以及测试机构的要求进行配置。 +### 2.1 ./code/config/config.sh配置 +`./code/config/config.sh`内容如下: +```bash +#!/bin/bash +echo "set env of glm2 train" + +export PYTHON_COMMAND=python3 +# 
以下cluster配置二选一,仅多机场景需要,目前glm2不支持多机,不涉及 +export CLUSTER_SSH_KEY_PATH=~/.ssh/id_rsa # 用户指定的ssh私钥,确保通过此私钥管理节点能免密访问所有计算节点(单机场景注释此行) +export CLUSTER_AUTO_SET_KEY='on' # 'off' or 'on', 若为'on' 不需要配置CLUSTER_SSH_KEY_PATH(单机场景注释此行) + +export GLM_RUN_MODE='only_finetune' + +# FINETUNE_CKPT_PATH, FINETUNE_DATA_PATH, EVAL_DATASET_PATH 这三个路径是相对mindformers源码的路径, 必须以./mindformers/开头 +# 可以在下载对应数据集完成后,将其复制到code/code/mindformers目录下,比如新建一个dataset_files存放解压后的AdvertiseGen +export FINETUNE_DATA_PATH=./mindformers/dataset_files/AdvertiseGen/train.json # 微调数据集实际路径 +export EVAL_DATASET_TYPE='ADGEN' # 'ADGEN' +export EVAL_DATASET_PATH=./mindformers/dataset_files/AdvertiseGen/dev.json # 评测用的数据集路径,必须以./mindformers/开头 +export FINETUNE_CKPT_PATH=./mindformers/checkpoint_download/glm2/glm2_6b.ckpt # 微调使用的预训练权重,必须以./mindformers/开头 +export EVAL_DEVICE_ID=0 # 评测用的npu 的device id + +export EPOCH_SIZE=1 +export GLM_LAYER_NUM=4 + +export RANK_SIZE=8 # 集群总加速卡数 +export DEVICE_NUM=8 # 集群每个节点的加速卡数 + +# parallel run params, parallel strategy config, DATA_PARALLEL * MODEL_PARALLEL * PIPELINE_STAGE should equal to RANK_SIZE +export DATA_PARALLEL=2 +export MODEL_PARALLEL=1 +export PIPELINE_STAGE=4 + +# need if rank_size > 1 +export RANK_TABLE_FILE=./hccl_xxxx_8p.json # 配置为生成的rank table路径,是相对于负载仓的code目录的路径,如果不在code目录下需要拷贝到code目录下 + +# 多机多卡需要配置,单机不需要配置,glm2不涉及 +#export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json +``` + +- 请参考`./code/config/config.sh`的注释将"资源准备"章节准备的资源的路径在`config.sh`中配置好, + +### 2.2 yaml配置 +- 前置声明:所有修改路径均为绝对路径 +- 需要修改code/run_glm2_6b_finetune_eval.yaml: +``` +train_dataset: &train_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/train.json" # 需要修改为实际AdvertiseGen/train.json路径 + shuffle: True + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" # 需要修改为实际tokenizer.model路径 + input_columns: ["input_ids", "labels"] + max_source_length: 64 + max_target_length: 128 + 
ignore_pad_token_for_loss: True + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 1 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +train_dataset_task: + type: KeyWordGenDataset + dataset_config: *train_dataset + +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" # 需要修改为实际AdvertiseGen/dev.json路径 + shuffle: False + phase: "eval" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" # 需要修改为实际tokenizer.model路径 + max_source_length: 256 + max_target_length: 256 + ignore_pad_token_for_loss: True + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 1 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +eval_dataset_task: + type: KeyWordGenDataset + dataset_config: *eval_dataset +``` + +- 修改code/run_glm2_6b_finetune.yaml**同样需要修改上述的部分**,另外把文件开头的load_checkpoint设置为glm2的实际ckpt路径: +``` +seed: 0 +run_mode: 'train' +output_dir: './output' # 当前不支持自定义修改,请勿修改该默认值 +load_checkpoint: 'glm2_6b.ckpt' # 修改为实际下载的glm2_6b.ckpt路径 +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +``` +## 3 负载启动 +### 3.1 在线测试 +执行命令 +```bash +./ais-bench-stubs +``` +### 3.2 轻量化离线测试 +执行命令 +```bash +./ais-bench-stubs test +``` \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..73312781cfdc218b22db844cc5fbcd42b77300f2 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/build.sh @@ -0,0 +1,33 @@ +#!/bin/bash +echo "start to build glm2 workload" + +declare -i ret_ok=0 +declare -i ret_error=1 
+ +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + return $ret_ok +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + + mkdir -p ${CURDIR}/output/config + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/config.sh -r ${CURDIR}//output/config/ + [ -d ${CURDIR}/doc ] && cp ${CURDIR}/doc -r ${CURDIR}/output/ + + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..4d398c2e456bdcae78aa1389ff0558faf894227e --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/config/config.sh @@ -0,0 +1,33 @@ +#!/bin/bash +echo "set env of glm train" + +export PYTHON_COMMAND=python3 +# 以下cluster配置二选一,仅多机场景需要 +export CLUSTER_SSH_KEY_PATH=~/.ssh/id_rsa # 用户指定的ssh私钥,确保通过此私钥管理节点能免密访问所有计算节点(单机场景注释此行) +export CLUSTER_AUTO_SET_KEY='on' # 'off' or 'on', 若为'on' 不需要配置CLUSTER_SSH_KEY_PATH(单机场景注释此行) + +export GLM_RUN_MODE='only_finetune' + +# FINETUNE_CKPT_PATH, FINETUNE_DATA_PATH, EVAL_DATASET_PATH 这三个路径是相对mindformers源码的路径, 必须以./mindformers/开头 +export FINETUNE_DATA_PATH=./mindformers/dataset_files/AdvertiseGen/train.json # 微调数据集 +export EVAL_DATASET_TYPE='ADGEN' # 'ADGEN' +export EVAL_DATASET_PATH=./mindformers/dataset_files/AdvertiseGen/dev.json # 评测用的数据集路径,必须以./mindformers/开头 +export FINETUNE_CKPT_PATH=./mindformers/checkpoint_download/glm2/glm2_6b.ckpt # 微调使用的预训练权重,必须以./mindformers/开头 +export EVAL_DEVICE_ID=0 # 评测用的npu 的device id + +export EPOCH_SIZE=1 +export 
GLM_LAYER_NUM=4 + +export RANK_SIZE=8 # 集群总加速卡数 +export DEVICE_NUM=8 # 集群每个节点的加速卡数 + +# parallel run params, parallel strategy config, DATA_PARALLEL * MODEL_PARALLEL * PIPELINE_STAGE should equal to RANK_SIZE +export DATA_PARALLEL=2 +export MODEL_PARALLEL=1 +export PIPELINE_STAGE=4 + +# need if rank_size > 1 +export RANK_TABLE_FILE=/home/hccl/hccl_xxxx_8p.json + +# 多机多卡需要配置,单机不需要配置 +# export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b4e8ebc7956666d75cfb0aa869cba10700bfa46 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. $SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r2.2"; } + + modelzoo_sub_dir="mindformers" + if [ "$branch_args" == "r2.2" ];then + branch="r0.8" + patch_file_name="r2.2" + commitid="c0f478fc517b1daec896f5c72bcea10b2ab83bd4" + git_url="https://gitee.com/mindspore/mindformers.git" + else + echo "bad parameters : $1" + return $ret_error + fi + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + local changed_code_path="$4" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! 
-d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + target_dir=$changed_code_path + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/r2.2.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/r2.2.patch new file mode 100644 index 0000000000000000000000000000000000000000..553076d839cfc0a399e997b61cda02e12eb0a9f9 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/r2.2.patch @@ -0,0 +1,231 @@ +diff -Nur origin/mindformers/core/callback/callback.py code/mindformers/core/callback/callback.py +--- origin/mindformers/core/callback/callback.py 2023-12-06 11:50:58.448000000 +0800 ++++ code/mindformers/core/callback/callback.py 2023-12-06 11:50:58.540000000 +0800 +@@ -33,7 +33,11 @@ + from mindformers.tools.register import MindFormerRegister, MindFormerModuleType + from mindformers.tools.cloud_adapter.cloud_adapter import Local2ObsMonitor + from mindformers.tools.logger import logger +-from mindformers.tools.utils import get_output_root_path, get_output_subpath, get_remote_save_url, check_in_modelarts ++from mindformers.tools.utils import get_output_root_path, get_output_subpath, get_remote_save_url, check_in_modelarts, save_aisbench_result ++try: ++ import ais_utils ++except Exception: ++ print("ais_utils not find") + + __all__ = ['ObsMonitor', 'MFLossMonitor', 'CheckpointMointor', 'SummaryMonitor', 'ProfileMonitor', 'EvalCallBack'] + +@@ -551,6 +555,7 @@ + + if save_ckpt: + logger.info('......Saving ckpt......') ++ save_aisbench_result("model_persistence_start_time", ais_utils.get_datatime().decode('utf-8')) + 
cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \ + + str(step_num_in_epoch) + ".ckpt" + # update checkpoint file list. +@@ -629,7 +634,7 @@ + self._config.async_save, {}, self._config.enc_key, self._config.enc_mode) + + save_only_network_params() +- ++ save_aisbench_result("model_persistence_end_time", ais_utils.get_datatime().decode('utf-8')) + self._latest_ckpt_file_name = cur_file + + +diff -Nur origin/mindformers/core/metric/metric.py code/mindformers/core/metric/metric.py +--- origin/mindformers/core/metric/metric.py 2023-12-06 11:50:58.448000000 +0800 ++++ code/mindformers/core/metric/metric.py 2023-12-06 11:50:58.540000000 +0800 +@@ -39,6 +39,11 @@ + + from .utils import PerplexityCell + from ...dataset.labels import cluener_labels ++from mindformers.tools.utils import save_aisbench_result ++try: ++ import ais_utils ++except Exception: ++ print("ais_utils not find") + + __all__ = ['EntityScore', 'SQuADMetric', 'PerplexityMetric', 'ADGENMetric', 'PromptAccMetric', 'EmF1Metric'] + +@@ -541,6 +546,12 @@ + return None + avg_loss = float(self.total_loss / self.num_data) + result = {"loss": avg_loss, "PPL": math.exp(avg_loss)} ++ result_log="loss: {}, Perplexity: {}".format(avg_loss, math.exp(avg_loss)) ++ try: ++ import ais_utils ++ ais_utils.set_result("training", "accuracy", result_log) ++ except Exception: ++ print("ais_utils not find") + if self.pipeline_parallel: + print("Average Loss and PPL Metric:", result) + return result +diff -Nur origin/mindformers/tools/transform_ckpt.py code/mindformers/tools/transform_ckpt.py +--- origin/mindformers/tools/transform_ckpt.py 2023-12-06 11:50:58.456000000 +0800 ++++ code/mindformers/tools/transform_ckpt.py 2023-12-06 11:50:58.548000000 +0800 +@@ -17,6 +17,11 @@ + import argparse + + import mindspore as ms ++from mindformers.tools.utils import save_aisbench_result ++try: ++ import ais_utils ++except Exception: ++ print("ais_utils not find") + + def get_strategy(startegy_path, rank_id=None): 
+ """Merge strategy if strategy path is dir +@@ -84,6 +89,8 @@ + print(f"dst_ckpt_dir: {dst_ckpt_dir}") + print(f"prefix: {prefix}") + +- print("......Start transform......") ++ print("......Start transform......") # model_format_start_time ++ ais_utils.set_result("training", "model_format_start_time", ais_utils.get_datatime().decode('utf-8')) + ms.transform_checkpoints(src_ckpt_dir, dst_ckpt_dir, prefix, src_ckpt_strategy, dst_ckpt_strategy) +- print("......Transform succeed!......") ++ print("......Transform succeed!......") # model_format_end_time ++ ais_utils.set_result("training", "model_format_end_time", ais_utils.get_datatime().decode('utf-8')) +diff -Nur origin/mindformers/tools/utils.py code/mindformers/tools/utils.py +--- origin/mindformers/tools/utils.py 2023-12-06 11:50:58.456000000 +0800 ++++ code/mindformers/tools/utils.py 2023-12-06 11:50:58.548000000 +0800 +@@ -55,6 +55,28 @@ + _PROTOCOL = 'obs' + _PROTOCOL_S3 = 's3' + ++AISBENCH_RESULT_PATH=os.getenv('RESULT_PATH') ++ ++ ++def save_aisbench_result(rt_key:str, rt_value): ++ cur_rank_id = os.getenv("RANK_ID", "0") ++ cur_run_mode = os.getenv("LLAMA_CUR_RUN_MODE", "train") ++ result_file_path = os.path.join(AISBENCH_RESULT_PATH, f"result_rank_{cur_rank_id}.json") ++ if not os.path.exists(result_file_path): ++ with open(result_file_path, "w") as f: ++ init_data = {} ++ json.dump(init_data, f) ++ with open(result_file_path, "r") as result_file: ++ result_log = json.load(result_file) ++ if not cur_run_mode in result_log: ++ result_log[cur_run_mode] = {} ++ if rt_key in result_log[cur_run_mode]: ++ result_log[cur_run_mode][rt_key].append(rt_value) ++ else: ++ result_log[cur_run_mode][rt_key] = [rt_value] ++ with open(result_file_path, "w") as result_file: ++ json.dump(result_log, result_file) ++ + + def check_in_modelarts(): + """Check if the training is on modelarts. 
+diff -Nur origin/mindformers/trainer/base_trainer.py code/mindformers/trainer/base_trainer.py +--- origin/mindformers/trainer/base_trainer.py 2023-12-06 11:50:58.456000000 +0800 ++++ code/mindformers/trainer/base_trainer.py 2023-12-06 11:50:58.552000000 +0800 +@@ -42,7 +42,7 @@ + from mindformers.wrapper import build_wrapper + from mindformers.tools.register import MindFormerConfig + from mindformers.tools.logger import logger +-from mindformers.tools.utils import count_params ++from mindformers.tools.utils import count_params, save_aisbench_result + from mindformers.auto_class import AutoModel + from mindformers.pet import get_pet_model + from .config_args import ConfigArguments +@@ -51,6 +51,10 @@ + from .optimizer_grouped_parameters import get_optimizer_grouped_parameters + from .utils import set_seed, check_train_data_loader_type, \ + check_eval_data_loader_type, check_optimizer_and_lr_type, check_wrapper_config ++try: ++ import ais_utils ++except Exception: ++ print("ais_utils not find") + + SUPPORT_TASKS = MindFormerBook().get_trainer_support_task_list() + SUPPORT_MODEL_NAMES = MindFormerBook().get_model_name_support_list() +@@ -543,7 +547,8 @@ + load_resume_context_from_checkpoint(config) + + # build dataset +- logger.info(".........Build Dataset For Train..........") ++ logger.info(".........Build Dataset For Train..........") # dataload_start_time ++ save_aisbench_result("dataload_start_time", ais_utils.get_datatime().decode('utf-8')) + if dataset is None: + dataset = self.create_train_dataset() + self.set_train_dataset(dataset) +@@ -553,10 +558,12 @@ + * config.runner_config.sink_size / dataset.get_dataset_size()) + # pylint: disable=W0212 + dataset._dataset_helper = DatasetHelper(dataset, config.runner_config.sink_mode, +- config.runner_config.sink_size, epoch_num) ++ config.runner_config.sink_size, epoch_num) # dataload_end_time ++ save_aisbench_result("dataload_end_time", ais_utils.get_datatime().decode('utf-8')) + + # build network +- 
logger.info(".........Build Net For Train..........") ++ logger.info(".........Build Net For Train..........") # train_launch_start_time ++ save_aisbench_result("train_launch_start_time", ais_utils.get_datatime().decode('utf-8')) + eval_network = None + if network is None and wrapper is None and \ + self.model_wrapper is None and self.network is None: +@@ -651,18 +658,30 @@ + step_interval=config.eval_step_interval if config.eval_step_interval else 100, + epoch_interval=config.eval_epoch_interval if config.eval_epoch_interval else -1, + ) +- callbacks.append(eval_callback) ++ callbacks.append(eval_callback) # train_launch_end_time + + logger.info(".........Starting Training Model..........") + if int(os.getenv("RANK_ID", '0')) % 8 == 0: + pprint(config) + logger.info(".........Model Compiling, Please Wait a Moment...........") ++ model.build(dataset, None, sink_size=config.runner_config.sink_size, epoch=config.runner_config.epochs) ++ save_aisbench_result("train_launch_end_time", ais_utils.get_datatime().decode('utf-8')) ++ logger.info(".........Model Build Done, Traininf Start...........") ++ train_start_time = ais_utils.get_datatime() ++ save_aisbench_result("train_start_time", train_start_time.decode('utf-8')) # train_start_time + model.train(config.runner_config.epochs, dataset, + callbacks=callbacks, + dataset_sink_mode=config.runner_config.sink_mode, + sink_size=config.runner_config.sink_size, + initial_epoch=config.runner_config.initial_epoch) +- logger.info(".........Training Over!.............") ++ logger.info(".........Training Over!.............") # train_end_time ++ train_end_time = ais_utils.get_datatime() ++ all_data_sum = int(dataset.get_dataset_size() * config.train_dataset.batch_size / int(os.getenv("RANK_SIZE", '8'))) * \ ++ config.runner_config.origin_epochs * config.model.model_config.seq_length ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, train_start_time, train_end_time) ++ save_aisbench_result("train_end_time", 
train_end_time.decode('utf-8')) ++ save_aisbench_result("throughput_ratio", throughput_rate) ++ + + def evaluate_process( + self, +diff -Nur origin/requirements.txt code/requirements.txt +--- origin/requirements.txt 2023-12-06 11:50:58.396000000 +0800 ++++ code/requirements.txt 2023-12-06 11:50:58.492000000 +0800 +@@ -11,4 +11,5 @@ + pydantic==1.10.11 + mdtex2html + gradio +-opencv-python-headless +\ No newline at end of file ++opencv-python-headless ++pyyaml +\ No newline at end of file +diff -Nur origin/scripts/run_distribute.sh code/scripts/run_distribute.sh +--- origin/scripts/run_distribute.sh 2023-12-06 11:50:58.460000000 +0800 ++++ code/scripts/run_distribute.sh 2023-12-06 11:50:58.556000000 +0800 +@@ -179,7 +179,14 @@ + fi + fi + shopt -u extglob +- ++wait ++if [ $? -eq 0 ];then ++ echo "all train processes completed successfully!" ++ exit 0 ++else ++ echo "one or more train processes exited with a error!" ++ exit 1 ++fi + + #cd ./pretrain_parallel${START_DEVICE} || exit + #tail -f mindformer.log diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..fd370f6cec567013c144b6f69c039e166f25d4cd --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/benchmark.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 +declare -i ret_mode_failed=5 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) +export DEPEND_PATH=$BASE_PATH/dependencies/ + +function get_node_train_data() +{ + if [ "$GLM_RUN_MODE" = "only_finetune" ];then + [ ! 
-f $FINETUNE_CKPT_PATH ] || { echo "finetune base ckpt:$FINETUNE_CKPT_PATH";return $ret_failed; } + fi + return $ret_ok +} + +# 配置训练相关的环境变量 +source ${CODE_PATH}/config/config.sh || { logger_Warn "source file failed:$?";return $ret_init_failed; } + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh +if [ -d $FINETUNE_DATA_PATH ];then + cp -r $FINETUNE_DATA_PATH $CUR_PATH || { logger_Warn "ERROR: cp $FINETUNE_DATA_PATH failed!";return $ret_init_failed; } +fi + +. $CODE_PATH/cluster_offline_run.sh + +main(){ + get_node_train_data || { logger_Warn "download open glm cpkt failed:$?";return $ret_init_failed; } + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..07ff4dbfa10055c5e176b88b6d64d8e50eae3594 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/cluster_offline_run.sh @@ -0,0 +1,134 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common_2.0.sh +. $CODE_PATH/common/node_common.sh + +# env check +export RELAT_WORK_PATH=work +export RELAT_RESULT_PATH=$RELAT_WORK_PATH/result +CONFIG_FILE="config.sh" +# set nodes work path. 
仅仅是管理节点的work/ +export WORK_PATH=${BASE_PATH}/work +# set nodes result path +export RESULT_PATH=${WORK_PATH}/result +local_env_cmd="source /etc/profile; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + export PYTHONPATH=$WORK_PATH:$PYTHONPATH; + export PYTHONPATH=$WORK_PATH/logging:$PYTHONPATH; + source $WORK_PATH/config/$CONFIG_FILE" +env_cmd="source /etc/profile; + export WORK_PATH=\$PWD/$RELAT_WORK_PATH; + export RESULT_PATH=\$PWD/$RELAT_RESULT_PATH; + export PYTHONPATH=\$WORK_PATH:\$PYTHONPATH; + export PYTHONPATH=\$WORK_PATH/logging:\$PYTHONPATH; + source \$WORK_PATH/config/$CONFIG_FILE" + +check_env() +{ + # check ranktable set + : "${RANK_SIZE?RANK_SIZE not set}" + : "${DEVICE_NUM?DEVICE_NUM not set}" + [[ $RANK_SIZE -eq 1 ]] || : "${RANK_TABLE_FILE?RANK_TABLE_FILE not set}" + [[ $RANK_SIZE -eq 1 ]] && [[ -n "$RANK_TABLE_FILE" ]] && { echo "ranksize=1 should not set RANK_TABLE_FILE";return 1; } + + # check python + : "${PYTHON_COMMAND?PYTHON_COMMAND not set}" + [ "$NODEINFO_FILE" == "" ] && { echo "NODEINFO_FILE not set, will not check cluster";return 0; } + if pip show ais_bench_cluster >/dev/null 2>&1;then + logger_Info "ais_bench cluster module exist, won't be installed again" + else + cluster_whl_path="${DEPEND_PATH}/cluster/ais_bench_cluster-*.whl" + if [ -f $cluster_whl_path ];then + pip install $cluster_whl_path --force-reinstall || { logger_Error "install cluster failed!";return 1; } + else + logger_Error "can't find ais_bench cluster wheel package" + fi + fi + + # check nodeinfofile exist + [[ $RANK_SIZE -le 8 ]] || check_file_valid "${NODEINFO_FILE}" || { echo "nodeinfofile:${NODEINFO_FILE} not valid" ; return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Error "source file failed:$?";return 1; } + if [ -d ${DEPEND_PATH}/logging ];then + cp -r ${DEPEND_PATH}/logging 
${CODE_PATH} + fi + check_env || { logger_Error "env check failed'" ; return 1; } + + # init ais_bench.cluster + cluster_init || { logger_Error "ais_bench_cluster init failed!";return 1; } + + # refresh result path + rm -rf ${BASE_PATH}/result;mkdir -p ${BASE_PATH}/result + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + rm -rf $WORK_PATH;mkdir -p $WORK_PATH + + if [ "$NODEINFO_FILE" != "" ];then + cmd="rm -rf ${RELAT_WORK_PATH};mkdir -p ${RELAT_WORK_PATH}" + cluster_multi_exec "$cmd" serial || { logger_Error "renew workpath failed"; return 1; } + fi + + # copy code to node work path + cp -r $CODE_PATH/* $WORK_PATH # CPU可以执行的都在host节点执行 + + if [ "$NODEINFO_FILE" != "" ];then + # sync data if work_path not exist so new one.节点的work/ 路径是相对于在node_file中指定的work_path + cluster_multi_put "$WORK_PATH" "./" || { logger_Error "deploy code to work place failed"; return 1; } + fi + cmd="source /etc/profile; + export WORK_PATH=\$PWD/$RELAT_WORK_PATH; + source \$WORK_PATH/config/$CONFIG_FILE; + bash \$WORK_PATH/run_node.sh check" + cluster_multi_exec "$cmd" serial|| { return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + if [ "$GLM_RUN_MODE" == "only_finetune" ];then + if [ "$NODEINFO_FILE" == "" ];then + cmd="$local_env_cmd; + rm -rf $RESULT_PATH/*.json; + bash $WORK_PATH/run_node.sh train finetune " + else + cmd="$env_cmd; + rm -rf \$RESULT_PATH/*.json; + bash \$WORK_PATH/run_node.sh train finetune " + fi + cluster_multi_exec "$cmd" || { logger_Error "run train(finetune) failed"; return 1; } + if [ "$NODEINFO_FILE" != "" ];then + cluster_multi_get "$RELAT_RESULT_PATH" "$BASE_PATH" || { logger_Error "cp result between nodes failed"; return 1; } + else + cp -r $WORK_PATH/result $BASE_PATH + fi + export PYTHONPATH=$WORK_PATH/logging:$PYTHONPATH + bash $WORK_PATH/run_node.sh merge || { logger_Error "ckpt merge failed"; 
return 1; } + fi + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="$local_env_cmd; + bash $WORK_PATH/run_node.sh eval" + eval "$cmd" || { logger_Error "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + source ${CODE_PATH}/config/$CONFIG_FILE + export PYTHONPATH=${CODE_PATH}/logging:$PYTHONPATH + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_glm2_result.py ${BASE_PATH}/result ${RANK_SIZE} ${GLM_RUN_MODE} + find $BASE_PATH/result/ -name "*.ckpt" -exec rm {} \; + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/run_glm2_6b_finetune.yaml b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/run_glm2_6b_finetune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a5ec8f398d796edce0116d199760862c63f32d6 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/run_glm2_6b_finetune.yaml @@ -0,0 +1,250 @@ +seed: 0 +run_mode: 'train' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '../../mindformers/checkpoint_download/glm2_6b.ckpt' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False + +# ==== context config ==== +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + 
max_call_depth: 10000 + max_device_memory: "30GB" # 59GB for Atlas 800T A2 + save_graphs: False + device_id: 0 + +# aicc +remote_save_url: "Please input obs url on AICC platform." + +# ==== model config ==== +model: + model_config: + type: ChatGLM2Config + batch_size: 1 # only for incremental infer + num_layers: 28 + padded_vocab_size: 65024 + hidden_size: 4096 + ffn_hidden_size: 13696 + kv_channels: 128 + num_attention_heads: 32 + seq_length: 193 + hidden_dropout: 0.0 + attention_dropout: 0.0 + layernorm_epsilon: 1e-5 + rmsnorm: True + apply_residual_connection_post_layernorm: False + post_layer_norm: True + add_bias_linear: False + add_qkv_bias: True + bias_dropout_fusion: True + multi_query_attention: True + multi_query_group_num: 2 + apply_query_key_layer_scaling: True + attention_softmax_in_fp32: True + fp32_residual_connection: False + quantization_bit: 0 + pre_seq_len: None + prefix_projection: False + param_init_type: "float16" + compute_dtype: "float16" + layernorm_compute_type: "float32" + use_past: False + use_flash_attention: False # when use FlashAttention, seq_length should be multiple of 16 + eos_token_id: 2 + pad_token_id: 0 + repetition_penalty: 1.0 + max_decode_length: 256 + checkpoint_name_or_path: "glm2_6b" + top_k: 1 + top_p: 1 + do_sample: True + arch: + type: ChatGLM2ForConditionalGeneration + +trainer: + type: CausalLanguageModelingTrainer + model_name: 'glm2_6b' +# if True do, evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. 
+do_eval: False +eval_step_interval: 1788 +eval_epoch_interval: -1 + +metric: + type: PerplexityMetric + +processor: + return_tensors: ms + tokenizer: + type: ChatGLM2Tokenizer + bos_token: '' + eos_token: '' + end_token: '' + mask_token: '[MASK]' + gmask_token: '[gMASK]' + pad_token: '' + unk_token: '' + # vocab_file: "/path/to/tokenizer.model" + type: GLMProcessor + +# ==== dataset config ==== +train_dataset: &train_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "../../mindformers/dataset_files/AdvertiseGen/train.json" + shuffle: True + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "../../mindformers/checkpoint_download/tokenizer.model" + input_columns: ["input_ids", "labels"] + max_source_length: 64 + max_target_length: 128 + ignore_pad_token_for_loss: True + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +train_dataset_task: + type: KeyWordGenDataset + dataset_config: *train_dataset + +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "./code/mindformers/dataset_files/AdvertiseGen/dev.json" + shuffle: False + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "./code/mindformers/checkpoint_download/tokenizer.model" + max_source_length: 64 + max_target_length: 127 + ignore_pad_token_for_loss: True + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +eval_dataset_task: + type: KeyWordGenDataset + dataset_config: *eval_dataset + +# ==== runner config ==== +runner_config: + epochs: 1 + batch_size: 8 + sink_mode: True + sink_size: 4 + +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: 
DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +# lr sechdule +lr_schedule: + type: polynomial + learning_rate: 5.e-5 + lr_end: 1.e-6 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset +layer_scale: False +layer_decay: 0.65 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + weight_decay: 0.1 +lr_scale: False +lr_scale_factor: 256 + +# parallel config +use_parallel: True +parallel: + parallel_mode: 1 # 0-dataset, 1-semi, 2-auto, 3-hybrid + gradients_mean: False + loss_repeated_mean: True + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True # optimizer shard + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + expert_parallel: 1 + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +micro_batch_interleave_num: 1 + +# moe +moe_config: + expert_num: 1 + capacity_factor: 1.05 + aux_loss_factor: 0.05 + num_experts_chosen: 1 + +# recompute +recompute_config: + recompute: True + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# autotune +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +# profile +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: True +profile_communication: True +profile_memory: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "glm2-6b" + save_checkpoint_steps: 1000 + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + - type: ObsMonitor + keep_last: False +eval_callbacks: + - type: ObsMonitor + keep_last: False diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/run_glm2_6b_finetune_eval.yaml 
b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/run_glm2_6b_finetune_eval.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1955ecf97d34bc444d896f8f896c3eb16c5a605e --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/run_glm2_6b_finetune_eval.yaml @@ -0,0 +1,250 @@ +seed: 0 +run_mode: 'eval' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: 'glm2_6b' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False + +# ==== context config ==== +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "30GB" # 59GB for Atlas 800T A2 + save_graphs: False + device_id: 0 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
+ +# ==== model config ==== +model: + model_config: + type: ChatGLM2Config + batch_size: 8 # only for incremental infer + num_layers: 28 + padded_vocab_size: 65024 + hidden_size: 4096 + ffn_hidden_size: 13696 + kv_channels: 128 + num_attention_heads: 32 + seq_length: 256 + hidden_dropout: 0.0 + attention_dropout: 0.0 + layernorm_epsilon: 1e-5 + rmsnorm: True + apply_residual_connection_post_layernorm: False + post_layer_norm: True + add_bias_linear: False + add_qkv_bias: True + bias_dropout_fusion: True + multi_query_attention: True + multi_query_group_num: 2 + apply_query_key_layer_scaling: True + attention_softmax_in_fp32: True + fp32_residual_connection: False + quantization_bit: 0 + pre_seq_len: None + prefix_projection: False + param_init_type: "float16" + compute_dtype: "float16" + layernorm_compute_type: "float32" + use_past: True + eos_token_id: 2 + pad_token_id: 0 + repetition_penalty: 1.0 + max_decode_length: 256 + checkpoint_name_or_path: "glm2_6b" + top_k: 1 + top_p: 1 + do_sample: True + arch: + type: ChatGLM2ForConditionalGeneration + +trainer: + type: CausalLanguageModelingTrainer + model_name: 'glm2_6b' +# if True do, evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. 
+do_eval: False +eval_step_interval: 500 +eval_epoch_interval: -1 + +metric: + type: ADGENMetric + +processor: + return_tensors: ms + tokenizer: + type: ChatGLM2Tokenizer + bos_token: '' + eos_token: '' + end_token: '' + mask_token: '[MASK]' + gmask_token: '[gMASK]' + pad_token: '' + unk_token: '' + # vocab_file: "/path/to/tokenizer.model" + type: GLMProcessor + +# ==== dataset config ==== +train_dataset: &train_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/train.json" + shuffle: True + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + input_columns: ["input_ids", "labels"] + max_source_length: 64 + max_target_length: 128 + ignore_pad_token_for_loss: True + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +train_dataset_task: + type: KeyWordGenDataset + dataset_config: *train_dataset + +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" + shuffle: False + phase: "eval" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "./work/code/mindformers/checkpoint_download/tokenizer.model" + max_source_length: 256 + max_target_length: 256 + ignore_pad_token_for_loss: True + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +eval_dataset_task: + type: KeyWordGenDataset + dataset_config: *eval_dataset + +# ==== runner config ==== +runner_config: + epochs: 1 + batch_size: 8 + sink_mode: True + sink_size: 4 + +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + 
use_clip_grad: True + +# lr sechdule +lr_schedule: + type: polynomial + learning_rate: 5.e-5 + lr_end: 1.e-6 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset +layer_scale: False +layer_decay: 0.65 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + weight_decay: 0.1 +lr_scale: False +lr_scale_factor: 256 + +# parallel config +use_parallel: True +parallel: + parallel_mode: 0 # 0-dataset, 1-semi, 2-auto, 3-hybrid + gradients_mean: False + loss_repeated_mean: True + enable_alltoall: False + full_batch: False + search_mode: "sharding_propagation" + enable_parallel_optimizer: True # optimizer shard + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" + only_trainable_params: False # 设置成 False,才能在策略文件中保存所有参数 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + expert_parallel: 1 + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +micro_batch_interleave_num: 1 + +# moe +moe_config: + expert_num: 1 + capacity_factor: 1.05 + aux_loss_factor: 0.05 + num_experts_chosen: 1 + +# recompute +recompute_config: + recompute: True + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# autotune +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +# profile +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: True +profile_communication: True +profile_memory: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "glm2-6b" + save_checkpoint_steps: 1000 + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + - type: ObsMonitor + keep_last: False +eval_callbacks: + - type: ObsMonitor + keep_last: False diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/run_node.sh 
new file mode 100644 index 0000000000000000000000000000000000000000..713c278706934a7b0e2a1f0439b398f45c16ff42 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_glm2/scripts/run_node.sh @@ -0,0 +1,167 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. $WORK_PATH/common/node_common.sh + +declare -i ret_ok=0 +declare -i ret_failed=1 + +GLM_RUN_YAML_NAME="run_glm2_6b_finetune.yaml" +GLM_EVAL_YAML_NAME="run_glm2_6b_finetune_eval.yaml" + +function get_node_rank_id_range() +{ + RANK_ID_RANGE="[0,8]" + # get server node id default is 0 + : "${NODE_ID:=0}" + # get rank start index + if [[ $DEVICE_NUM == 1 && $RANK_SIZE == 1 ]];then + : "${SINGLE_CARD_INDEX:=0}" + RANK_START=$SINGLE_CARD_INDEX + else + # get rank start index + RANK_START=`expr ${NODE_ID} \* $DEVICE_NUM` + fi + RANK_ID_MAX=$[DEVICE_NUM+RANK_START] + RANK_ID_RANGE="[$RANK_START,$RANK_ID_MAX]" +} + +function node_init() +{ + export PYTHONPATH=$WORK_PATH:$WORK_PATH/logging:$PYTHONPATH + + if [ $1 == "check" ];then + # install pyyaml + if pip show pyyaml >/dev/null 2>&1;then + logger_Info "pyyaml exist, won't be installed again" + else + pip_cmd="pip install pyyaml" + $pip_cmd || { logger_Warn "pyyaml install failed:$?";return $ret_failed; } + fi + # install mindformers + if pip show mindformers >/dev/null 2>&1;then + logger_Info "mindformers exist, won't be installed again" + else + cd $WORK_PATH/code + pip install . 
|| { logger_Warn "mindformers install failed:$?";return $ret_failed; } + cd $WORK_PATH + fi + fi + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + get_node_rank_id_range + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + rank_table_path=${WORK_PATH}/${RANK_TABLE_FILE} + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$rank_table_path" || { logger_Warn "node common check failed" ; return $ret_failed; } + + # check_mindspore_run_ok_Ascend ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return $ret_failed; } + check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return $ret_failed; } + logger_Debug "mindspore running successfully" + + if [ "$GLM_RUN_MODE" == "only_finetune" ];then + check_file_valid "${WORK_PATH}/code/${FINETUNE_DATA_PATH}" || { logger_Warn "FINETUNE_DATA_PATH:${FINETUNE_DATA_PATH} not valid path" ; return 1; } + logger_Debug "FINETUNE_DATA_PATH is valid" + check_file_valid "${WORK_PATH}/code/${EVAL_DATASET_PATH}" || { logger_Warn "EVAL_DATASET_PATH:${EVAL_DATASET_PATH} not valid path" ; return 1; } + logger_Debug "EVAL_DATASET_PATH is valid" + fi + +} + +function ckpt_merge() +{ + transform_ckpt_path=$WORK_PATH/code/mindformers/tools/transform_ckpt.py + result_output_path=$WORK_PATH/../result/output + cd $WORK_PATH + # ckpt merge + $PYTHON_COMMAND $transform_ckpt_path \ + --src_ckpt_strategy $result_output_path/strategy/ \ + --src_ckpt_dir $result_output_path/checkpoint/ \ + --dst_ckpt_dir $result_output_path/target_ckpt/ \ + --prefix "glm2_6b" || { logger_Warn "ckpt merge failed, rank id range: $RANK_ID_RANGE" ; return $ret_failed; } + rm -rf $result_output_path/checkpoint/ +} + +function node_train() +{ + logger_Info "node_train running" + export GLM_CUR_RUN_MODE=$1 + source $WORK_PATH/config/config.sh + 
run_script_path=$WORK_PATH/code/scripts/ + run_yaml_path=$WORK_PATH/code/configs/glm2/$GLM_RUN_YAML_NAME + rank_table_path=${WORK_PATH}/$RANK_TABLE_FILE + run_processed_yaml=$WORK_PATH/$GLM_RUN_YAML_NAME + cp $run_processed_yaml $run_yaml_path + # train run + cd $run_script_path + cmd="bash run_distribute.sh $rank_table_path $run_yaml_path $RANK_ID_RANGE $1" + [ "$NODEINFO_FILE" != "" ] && cmd="$cmd $RANK_SIZE" + echo "$cmd" + $cmd || { logger_Warn "node_run failed, rank id range: $RANK_ID_RANGE" ; return $ret_failed; } + mv $WORK_PATH/code/output/ $WORK_PATH/../result/ || { logger_Warn "move output failed!" ; return $ret_failed; } # store in $WORK_PATH/../result/output + return $ret_ok +} + +function eval_run() +{ + logger_Info "eval_run running" + eval_yaml_path=$WORK_PATH/code/configs/glm2/$GLM_EVAL_YAML_NAME + eval_processed_yaml=$WORK_PATH/$GLM_EVAL_YAML_NAME + cp $eval_processed_yaml $eval_yaml_path + eval_dataset_path=$WORK_PATH/code/$EVAL_DATASET_PATH + load_checkpoint_path=$WORK_PATH/../result/output/target_ckpt/rank_0/glm2_6b0.ckpt + if [ "$EVAL_DATASET_TYPE" = "ADGEN" ];then + echo "run eval using ADGEN" + eval_script_path=$WORK_PATH/code/run_mindformer.py + $PYTHON_COMMAND $eval_script_path \ + --config $eval_yaml_path \ + --eval_dataset_dir $eval_dataset_path \ + --run_mode eval \ + --load_checkpoint $load_checkpoint_path \ + --use_parallel False \ + --device_id $EVAL_DEVICE_ID || { logger_Warn "run eval failed" ; return $ret_failed; } + else + echo "invalid eval mode" + rm -rf $load_checkpoint_path + return $ret_failed + fi + rm -rf $load_checkpoint_path + return $ret_ok +} + +function node_eval() +{ + logger_Info "node_eval running" + if [ "$GLM_RUN_MODE" == "only_finetune" ];then + eval_run + else + echo "glm2 run mode not supported" + return $ret_failed + fi + return $ret_ok +} + +main() +{ + type="$1" + mode="$2" + shift + node_init $type || { logger_Warn "init failed"; return $ret_failed; } + if [ "$type" == "train" ];then + node_train $mode || { 
logger_Warn "run_node_train failed"; return $ret_failed; } + elif [ "$type" == "merge" ];then + ckpt_merge || { logger_Warn "ckpt_merge failed"; return $ret_failed; } + elif [ "$type" == "eval" ];then + node_eval || { logger_Warn "run_node_eval failed"; return $ret_failed; } + elif [ "$type" == "check" ];then + node_check || { logger_Warn "run_node_check failed"; return $ret_failed; } + else + { logger_Warn "invalid argument '${type}'"; return $ret_failed; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..abe3be71802a849f3c732984e547f4468e7435e5 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/build.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + + mkdir -p ${CURDIR}/output/config + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/config.sh -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/modelarts_config.py -r ${CURDIR}//output/config/ + [ "$1" == "r1.3" ] && { cp ${CURDIR}/config/modelarts_config.py.r1.3 -r ${CURDIR}//output/config/modelarts_config.py; } + [ -d ${CURDIR}/doc ] && cp ${CURDIR}/doc -r ${CURDIR}/output/ + + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..3322bacb30f86d459100ff32e212da8d2c3160dc --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/config/config.sh @@ -0,0 +1,18 @@ +export PYTHON_COMMAND=python3.7 +export TRAIN_DATASET=/home/datasets/wmt16/mindrecord/train.tok.clean.bpe.32000.en.mindrecord +export TEST_DATASET=/home/datasets/wmt16/mindrecord/newstest2014.en.mindrecord +export EXISTED_CKPT_PATH=/home/datasets/pretrain_model/gnmt_v2/gnmtv2_ascend_v180_wmtende_official_nlp_acc24.ckpt +export VOCAB_ADDR=/home/datasets/wmt16/vocab.bpe.32000 +export BPE_CODE_ADDR=/home/datasets/wmt16/bpe.32000 +export TEST_TARGET=/home/datasets/wmt16/newstest2014.de + +# 8p +export RANK_SIZE=8 +export DEVICE_NUM=8 + +# options needed only if rank_size > 1 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json + +# needed only in cluster mode +# export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..e9540bd680d08e5ab3bbd8a2a3019756a50f746d --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r1.5"; } + + if [ "$branch_args" == "r1.9" ];then + branch="r1.8" + patch_file_name="r1.9" + commitid="f4eed0f958b40992ee96dd6cfefd76ae989c872f" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/gnmt_v2" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/r1.9.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/r1.9.patch new file mode 100644 index 0000000000000000000000000000000000000000..024184fc5dec7b43a4b89b157c561967cfdf172a --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/r1.9.patch @@ -0,0 +1,72 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2022-11-02 17:55:22.768000000 +0800 ++++ code/eval.py 2022-11-02 17:55:22.784000000 +0800 +@@ -101,6 +101,16 @@ + tokenizer = Tokenizer(vocab, bpe_codes, 'en', 'de') + scores = bleu_calculate(tokenizer, result_npy_addr, test_tgt) + print(f"BLEU scores is :{scores}") ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ print("ACC_DIR:", ACC_DIR) ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(scores)) + + if __name__ == '__main__': + run_eval() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-11-02 17:55:22.776000000 +0800 ++++ code/train.py 2022-11-02 17:55:22.784000000 +0800 +@@ -41,6 +41,12 @@ + from model_utils.config import config as default_config + from model_utils.moxing_adapter import moxing_wrapper + from model_utils.device_adapter import get_device_id, get_device_num ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + def _train(model, config, + pre_training_dataset=None, fine_tune_dataset=None, test_dataset=None, +@@ -222,11 +228,35 @@ + callbacks.append(summary_callback) + + print(f" | ALL SET, PREPARE TO TRAIN.") ++ model.build(dataset, sink_size=dataset.get_dataset_size(), epoch=config.epochs) ++ ++ if ais_utils_is_existed: ++ start_time = 
ais_utils.get_datatime() ++ else: ++ start_time = time.time() + _train(model=model, config=config, + pre_training_dataset=pre_training_dataset, + fine_tune_dataset=fine_tune_dataset, + test_dataset=test_dataset, + callbacks=callbacks) ++ all_data_sum = config.epochs * dataset.get_dataset_size() * config.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum / (end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + + def _setup_parallel_env(): diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . 
$CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..d72557eb7ae6ba01fa1ce4fba4f7bbe69989c190 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/scripts/cluster_offline_run.sh @@ -0,0 +1,80 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + : "${EVAL_DATA_PATH?EVAL_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + 
export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + + [ -d $BASE_PATH/result ] && cp ${RESULT_PATH}/* -rf $BASE_PATH/result/ + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..d84c63858f38dff71197ab926803a91ef3e9d9f1 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_gnmt_v2/scripts/run_node.sh @@ -0,0 +1,87 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +function get_train_cmd() +{ + CONFIG_FILE=$WORK_PATH/code/default_config.yaml + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py \ + --config_path=$CONFIG_FILE \ + --pre_train_dataset=$TRAIN_DATA_PATH \ + --device_id=${DEVICE_ID} + " + return 0 +} + +function get_eval_cmd() +{ + CONFIG_FILE=$WORK_PATH/code/default_test_config.yaml + eval_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/eval.py \ + --config_path=$CONFIG_FILE \ + --test_dataset=$EVAL_DATA_PATH \ + --existed_ckpt=$EXISTED_CKPT_PATH \ + --vocab=$VOCAB_ADDR \ + --bpe_codes=$BPE_CODE_ADDR \ + --test_tgt=$TEST_TARGET" + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH + source $WORK_PATH/config/mindspore_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + + check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return 1; } + logger_Debug "mindspore running successfully" + +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "false" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + CHECKPOINT_PATH=`find ${WORK_PATH}/train_parallel$RANK_ID/ -name "*.ckpt" | xargs ls -t | awk 'NR==1{print}'` + [ -f $CHECKPOINT_PATH ] || { logger_Warn "CHECKPOINT_PATH:${CHECKPOINT_PATH} not valid path" ; return 1; } + cp $CHECKPOINT_PATH $RESULT_PATH/ + RUN_PATH=$WORK_PATH/train_parallel$RANK_ID + cd $RUN_PATH + get_eval_cmd + echo "start eval RUN_PATH:${RUN_PATH} SERVER_ID:$SERVER_ID rank $RANK_ID device $DEVICE_ID begin cmd:${eval_run_cmd}" + $eval_run_cmd || { echo "run 
eval node error ret:$?"; return 1; } + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/README.md b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0bb01a710e92c922e704bce112d3384e4dd35f54 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/README.md @@ -0,0 +1,156 @@ +# 基于MindSpore/mindformers框架的llama大模型训练负载使用指南 +本文主要介绍使用基于LLaMA 或LLaMA2 大模型训练业务代码构建的AISBench的负载包"train_huawei_train_mindspore_llama-Ais-Benchmark-Stubs-{arch}-2.0-r2.2.tar.gz",进行服务器性能测试的流程。 +## 名词定义 +|名词|定义| +| --- | --- | +|管理节点|运行大模型训练负载的环境,只有一个| +|计算节点|执行训练任务的环境,可以有多个| +## 运行环境前置条件 +### 管理节点 +``` +python >= 3.7 +``` +### 计算节点 +``` +mindspore >= 2.2 +``` +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" +``` +如果正常输出: +```bash +MindSpore version: 版本号 +The result of multiplication calculation is correct, MindSpore has been installed on platform [Ascend] successfully! +``` +说明成功。 +### 单机多卡与多机多卡的区别 +单机多卡执行负载时,只在单机环境上部署和运行即可;多机多卡执行负载时,多机就是多个计算节点,管理节点必须是其中一个计算节点。
+**多机多卡需注意** +1. 为确保能操作计算节点的数据,管理节点需要是root用户 +## 负载包中文件夹主要目录结构 + +``` +├── ais-bench-stubs # Stubs主程序,负责流程控制、通信与数据管理等 +├── code # 业务代码目录 +│ ├── benchmark.sh # 入口脚本,会被ais-bench-stubs调用,调用业务代码,被测试者需要通过编写该脚本,对接运行的训练和推理脚本 +│ ├── config +│ │ ├── config.sh # 训练相关的配置文件,包括数据集、权重路径等信息 +│ └── code # mindformers全部代码,嵌入了AISBench的打点上报接口 +│ └──mindformers +├── config +│ ├── config.json # 测试配置文件,包含tester服务器信息、testerId等信息 +│ └── system.json # 被测试环境系统基本信息json文件,被测试者自行上传,比如硬件信息等 +├── dependencies # stubs的依赖组件 +│ ├── cluster # 分布式运行组件 +│ │ ├── ais_bench_cluster--py3-none-linux_.whl +│ │ ├── README.md +│ └── logging # 测试结果传输模块 +│ ├── ais_utils.py # 打点入口脚本,设置相关业务运行参数并反馈测试结果 +│ └── libais_utils.so # 测试结果传输模块lib,负责将测试结果传输到stubs模块相关文件部署 +├── log # 测试log日志。建议无需上传的日志文件,另建目录存放 +├── result # 测试结果文件。建议无需上传的结果文件,另建目录存放 +└── STUBS_PACKAGE_INTRO.md # Stubs被测试者接入使用文档 +``` +**后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./ais-bench-stubs表示Stubs主程序** +## 资源准备 +### 前置声明 +运行LLaMA(LLaMA2)训练的MindSpore/mindformers的代码全部在`./code/code`文件夹中,资源的准备参考[LLaMA资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama.md)或[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md),具体资源的参考详见本章其他小节。
+**注意**:需要确认计算节点中是否原来已经安装了MindFormers,如果安装了,请使用`pip uninstall mindformers`卸载,确保负载代码的MindFormers能正常在计算节点中安装。 +### rank_table_file准备(llama和llama2通用) +确保`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。 + +参考[LLaMA资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama.md)的“生成RANK_TABLE_FILE(多卡运行必须环节)”和“多机RANK_TABLE_FILE合并(多机多卡必备环节)”章节。 + +### 模型权重下载与转换 +LLaMA参考[LLaMA资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama.md)的“模型权重下载与转换”章节; +LLaMA2参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“模型权重下载与转换”章节。 + +### 数据集准备 +#### 预训练数据集准备 +LLaMA参考[LLaMA资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama.md)的“预训练/数据集准备-预训练”章节; +LLaMA2参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“预训练/数据集准备”章节。 +#### 微调数据集准备 +LLaMA参考[LLaMA资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama.md)的“微调/数据集准备-微调”章节; +LLaMA2参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“微调/数据集准备”章节。 +#### 评测数据集准备 +**wikitext** +LLaMA参考[LLaMA资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama.md)的“评测/文本生成/获取数据集”章节; +LLaMA2参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测/文本生成/获取数据集”章节。 + +### nodeinfo_file准备(多机多卡训练需要) +nodeinfo_file为json文件,需要用户自行创建(如nodeinfo_file.json)并按照如下格式配置计算节点信息(不要把注释加进去): +```json +{ + "0": { // 
计算节点编号,为用户自定义,非设备实际编号,配置要求:不能重复、必须是0开始的连续整数,例如共有4个节点,节点编号只能取0,1,2,3。若不同节点配置了相同编号,那么只会读取其中一个节点的信息,另一个节点信息则被覆盖,实际运行测试时被覆盖的节点不会被测试。 + "ip": "xx.xx.xx.xx", // 计算节点的ip地址 ipv4 + "user": "user0", // 计算节点的用户名 + "port": 12345, // 访问计算节点的端口 + "work_path": "/xx/xx/xx/xx" // 计算节点的工作路径,管理节点进入节点后处于的路径 + }, + "1":{ + ... + } + ... +} +``` +**注意**:作为多机多卡时的管理节点的计算节点,work_path必须填写`train_huawei_train_mindspore_llama-Ais-Benchmark-Stubs-{arch}-2.0-r2.2/`目录的绝对路径 + +## 负载启动前配置项 +### 和tester连接的配置(仅在线测试需要) +`./config/config.json`和`./config/system.json`请参考《Stubs被测试者接入使用文档》中的“配置与Tester相关的配置文件”章节以及测试机构的要求进行配置。 +### ./code/config/config.sh配置 +`./code/config/config.sh`内容如下: +```bash +#!/bin/bash +echo "set env of llama train" +# 后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./ais-bench-stubs表示Stubs主程序 + +export PYTHON_COMMAND=python3 +# 以下cluster配置二选一,仅多机场景需要 +export CLUSTER_SSH_KEY_PATH=~/.ssh/id_rsa # 用户指定的ssh私钥,需要确保管理节点通过此私钥能免密访问所有计算节点 +export CLUSTER_AUTO_SET_KEY='on' # 'off' or 'on',若为'on' 不需要配置CLUSTER_SSH_KEY_PATH + + +export LLAMA_MODEL_SCALE='7b' # '7b' 、'13b' 、 '70b'(仅llama2支持) +export LLAMA_MODEL_TYPE='' # llama : '' ; llama2 : '2' +export LLAMA_RUN_MODE='only_pretrain' # 'only_pretrain', 'only_finetune',决定负载时执行预训练还是微调 + +# PRETRAIN_DATA_PATH, FINETUNE_DATA_PATH, EVAL_DATASET_PATH 这三个路径是相对./code/code/mindformers源码的路径, 必须以./mindformers/开头 +export PRETRAIN_DATA_PATH=./mindformers/dataset_files/wikitext-2/wiki2048.mindrecord # 预训练数据集 +export FINETUNE_DATA_PATH=./mindformers/dataset_files/alpaca-fastchat2048.mindrecord # 微调数据集 +export EVAL_DATASET_TYPE='wikitext' # 评估数据集名,可选 'wikitext' +export EVAL_DATASET_PATH=./mindformers/dataset_files/wikitext-2/wiki2048valid.mindrecord # 评测用的数据集路径,必须以./mindformers/开头 +export FINETUNE_CKPT_PATH=../../open_llama_7b.ckpt # only for 'only_finetune',相对于train_huawei_train_mindspore_llama-Ais-Benchmark-Stubs-{arch}-2.0-r2.2/code/目录的路径,权重放在code/内将自动分发到计算节点 +export EVAL_DEVICE_ID=0 # 评测用的npu 的device id + +export EPOCH_SIZE=1 # 全量遍历数据集的迭代次数 +export LLAMA_LAYER_NUM=32 # 7b:32 
13b:40 70b: 80 + +export RANK_SIZE=8 # 集群总加速卡数 +export DEVICE_NUM=8 # 集群每个节点的加速卡数 + +# parallel run params, parallel strategy config, DATA_PARALLEL * MODEL_PARALLEL * PIPELINE_STAGE should equal to RANK_SIZE +export DATA_PARALLEL=2 +export MODEL_PARALLEL=1 +export PIPELINE_STAGE=4 + +# need if rank_size > 1 +export RANK_TABLE_FILE=hccl_xxxx_8p.json # rank_table_file的路径,相对于train_huawei_train_mindspore_llama-Ais-Benchmark-Stubs-{arch}-2.0-r2.2/code/目录的路径,rank_table_file需要放在code/内 + +# 多机多卡需要配置,单机不需要配置 +#export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json +``` +请参考`./code/config/config.sh`的注释将第一章准备的资源的路径在`config.sh`中配置好,并且确定好训练相关的参数 + +## 负载启动 +### 在线测试 +在管理节点上执行命令 +```bash +./ais-bench-stubs +``` +### 轻量化离线测试 +在管理节点上执行命令 +```bash +./ais-bench-stubs test +``` \ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..568350c9ca7505e9333a706472ddf9b7dc183caf --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/build.sh @@ -0,0 +1,34 @@ +#!/bin/bash +echo "start to build llama workload" + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + return $ret_ok +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp -f ${CURDIR}/README.md ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + + mkdir -p ${CURDIR}/output/config + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/config.sh -r ${CURDIR}//output/config/ + [ -d ${CURDIR}/doc ] && cp ${CURDIR}/doc -r ${CURDIR}/output/ + + file_change "$2" || { echo "file 
change failed"; return 1; } +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..ce638350f4b549ae3b8a41790f4329def72ef4b6 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/config/config.sh @@ -0,0 +1,38 @@ +#!/bin/bash +echo "set env of llama train" +# 后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./ais-bench-stubs表示Stubs主程序 + +export PYTHON_COMMAND=python3 + +# 以下cluster配置二选一,仅多机场景需要 +export CLUSTER_SSH_KEY_PATH=~/.ssh/id_rsa # 用户指定的ssh私钥,需要确保管理节点通过此私钥能免密访问所有计算节点 +export CLUSTER_AUTO_SET_KEY='on' # 'off' or 'on',若为'on' 不需要配置CLUSTER_SSH_KEY_PATH + +export LLAMA_MODEL_SCALE='7b' # '7b' 、'13b' 、 '70b'(仅llama2支持) +export LLAMA_MODEL_TYPE='' # llama : '' ; llama2 : '2' +export LLAMA_RUN_MODE='only_pretrain' # 'only_pretrain', 'only_finetune',决定负载时执行预训练还是微调 + +# PRETRAIN_DATA_PATH, FINETUNE_DATA_PATH, EVAL_DATASET_PATH 这三个路径是相对./code/code/mindformers源码的路径, 必须以./mindformers/开头 +export PRETRAIN_DATA_PATH=./mindformers/dataset_files/wikitext-2/wiki2048.mindrecord # 预训练数据集 +export FINETUNE_DATA_PATH=./mindformers/dataset_files/alpaca-fastchat2048.mindrecord # 微调数据集 +export EVAL_DATASET_TYPE='wikitext' # 评估数据集名,可选 'wikitext' +export EVAL_DATASET_PATH=./mindformers/dataset_files/wikitext-2/wiki2048valid.mindrecord # 评测用的数据集路径,必须以./mindformers/开头 +export FINETUNE_CKPT_PATH=open_llama_7b.ckpt # only for 'only_finetune',相对于train_huawei_train_mindspore_llama-Ais-Benchmark-Stubs--2.0-r2.2/code/目录的路径,权重放在code/内将自动分发到计算节点 +export EVAL_DEVICE_ID=0 # 评测用的npu 的device id + +export EPOCH_SIZE=1 # 全量遍历数据集的迭代次数 +export LLAMA_LAYER_NUM=32 # 7b:32 13b:40 70b: 80 + +export RANK_SIZE=8 # 集群总加速卡数 +export DEVICE_NUM=8 # 集群每个节点的加速卡数 + +# parallel run params, parallel strategy config, DATA_PARALLEL * MODEL_PARALLEL * PIPELINE_STAGE should equal to RANK_SIZE 
+export DATA_PARALLEL=2 +export MODEL_PARALLEL=1 +export PIPELINE_STAGE=4 + +# need if rank_size > 1 +export RANK_TABLE_FILE=hccl_xxxx_8p.json # rank_table_file的路径,相对于train_huawei_train_mindspore_llama-Ais-Benchmark-Stubs--2.0-r2.2/code/目录的路径,rank_table_file需要放在code/内 + +# 多机多卡需要配置,单机不能配置 +#export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b4e8ebc7956666d75cfb0aa869cba10700bfa46 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. $SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r2.2"; } + + modelzoo_sub_dir="mindformers" + if [ "$branch_args" == "r2.2" ];then + branch="r0.8" + patch_file_name="r2.2" + commitid="c0f478fc517b1daec896f5c72bcea10b2ab83bd4" + git_url="https://gitee.com/mindspore/mindformers.git" + else + echo "bad parameters : $1" + return $ret_error + fi + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + local changed_code_path="$4" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! 
-d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + target_dir=$changed_code_path + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/r2.2.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/r2.2.patch new file mode 100644 index 0000000000000000000000000000000000000000..73b95b31a6e27892428d81f8601ecf6caf34b85f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/r2.2.patch @@ -0,0 +1,231 @@ +diff -Nur origin/mindformers/core/callback/callback.py code/mindformers/core/callback/callback.py +--- origin/mindformers/core/callback/callback.py 2023-12-06 11:50:58.448000000 +0800 ++++ code/mindformers/core/callback/callback.py 2023-12-06 11:50:58.540000000 +0800 +@@ -33,7 +33,11 @@ + from mindformers.tools.register import MindFormerRegister, MindFormerModuleType + from mindformers.tools.cloud_adapter.cloud_adapter import Local2ObsMonitor + from mindformers.tools.logger import logger +-from mindformers.tools.utils import get_output_root_path, get_output_subpath, get_remote_save_url, check_in_modelarts ++from mindformers.tools.utils import get_output_root_path, get_output_subpath, get_remote_save_url, check_in_modelarts, save_aisbench_result ++try: ++ import ais_utils ++except Exception: ++ print("ais_utils not find") + + __all__ = ['ObsMonitor', 'MFLossMonitor', 'CheckpointMointor', 'SummaryMonitor', 'ProfileMonitor', 'EvalCallBack'] + +@@ -551,6 +555,7 @@ + + if save_ckpt: + logger.info('......Saving ckpt......') ++ save_aisbench_result("model_persistence_start_time", ais_utils.get_datatime().decode('utf-8')) 
+ cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \ + + str(step_num_in_epoch) + ".ckpt" + # update checkpoint file list. +@@ -629,7 +634,7 @@ + self._config.async_save, {}, self._config.enc_key, self._config.enc_mode) + + save_only_network_params() +- ++ save_aisbench_result("model_persistence_end_time", ais_utils.get_datatime().decode('utf-8')) + self._latest_ckpt_file_name = cur_file + + +diff -Nur origin/mindformers/core/metric/metric.py code/mindformers/core/metric/metric.py +--- origin/mindformers/core/metric/metric.py 2023-12-06 11:50:58.448000000 +0800 ++++ code/mindformers/core/metric/metric.py 2023-12-06 11:50:58.540000000 +0800 +@@ -39,6 +39,11 @@ + + from .utils import PerplexityCell + from ...dataset.labels import cluener_labels ++from mindformers.tools.utils import save_aisbench_result ++try: ++ import ais_utils ++except Exception: ++ print("ais_utils not find") + + __all__ = ['EntityScore', 'SQuADMetric', 'PerplexityMetric', 'ADGENMetric', 'PromptAccMetric', 'EmF1Metric'] + +@@ -541,6 +546,12 @@ + return None + avg_loss = float(self.total_loss / self.num_data) + result = {"loss": avg_loss, "PPL": math.exp(avg_loss)} ++ result_log="loss: {}, Perplexity: {}".format(avg_loss, math.exp(avg_loss)) ++ try: ++ import ais_utils ++ ais_utils.set_result("training", "accuracy", result_log) ++ except Exception: ++ print("ais_utils not find") + if self.pipeline_parallel: + print("Average Loss and PPL Metric:", result) + return result +diff -Nur origin/mindformers/tools/transform_ckpt.py code/mindformers/tools/transform_ckpt.py +--- origin/mindformers/tools/transform_ckpt.py 2023-12-06 11:50:58.456000000 +0800 ++++ code/mindformers/tools/transform_ckpt.py 2023-12-06 11:50:58.548000000 +0800 +@@ -17,6 +17,11 @@ + import argparse + + import mindspore as ms ++from mindformers.tools.utils import save_aisbench_result ++try: ++ import ais_utils ++except Exception: ++ print("ais_utils not find") + + def get_strategy(startegy_path, 
rank_id=None): + """Merge strategy if strategy path is dir +@@ -84,6 +89,8 @@ + print(f"dst_ckpt_dir: {dst_ckpt_dir}") + print(f"prefix: {prefix}") + +- print("......Start transform......") ++ print("......Start transform......") # model_format_start_time ++ ais_utils.set_result("training", "model_format_start_time", ais_utils.get_datatime().decode('utf-8')) + ms.transform_checkpoints(src_ckpt_dir, dst_ckpt_dir, prefix, src_ckpt_strategy, dst_ckpt_strategy) +- print("......Transform succeed!......") ++ print("......Transform succeed!......") # model_format_end_time ++ ais_utils.set_result("training", "model_format_end_time", ais_utils.get_datatime().decode('utf-8')) +diff -Nur origin/mindformers/tools/utils.py code/mindformers/tools/utils.py +--- origin/mindformers/tools/utils.py 2023-12-06 11:50:58.456000000 +0800 ++++ code/mindformers/tools/utils.py 2023-12-06 11:50:58.548000000 +0800 +@@ -55,6 +55,28 @@ + _PROTOCOL = 'obs' + _PROTOCOL_S3 = 's3' + ++AISBENCH_RESULT_PATH=os.getenv('RESULT_PATH') ++ ++ ++def save_aisbench_result(rt_key:str, rt_value): ++ cur_rank_id = os.getenv("RANK_ID", "0") ++ cur_run_mode = os.getenv("LLAMA_CUR_RUN_MODE", "train") ++ result_file_path = os.path.join(AISBENCH_RESULT_PATH, f"result_rank_{cur_rank_id}.json") ++ if not os.path.exists(result_file_path): ++ with open(result_file_path, "w") as f: ++ init_data = {} ++ json.dump(init_data, f) ++ with open(result_file_path, "r") as result_file: ++ result_log = json.load(result_file) ++ if not cur_run_mode in result_log: ++ result_log[cur_run_mode] = {} ++ if rt_key in result_log[cur_run_mode]: ++ result_log[cur_run_mode][rt_key].append(rt_value) ++ else: ++ result_log[cur_run_mode][rt_key] = [rt_value] ++ with open(result_file_path, "w") as result_file: ++ json.dump(result_log, result_file) ++ + + def check_in_modelarts(): + """Check if the training is on modelarts. 
+diff -Nur origin/mindformers/trainer/base_trainer.py code/mindformers/trainer/base_trainer.py +--- origin/mindformers/trainer/base_trainer.py 2023-12-06 11:50:58.456000000 +0800 ++++ code/mindformers/trainer/base_trainer.py 2023-12-06 11:50:58.552000000 +0800 +@@ -42,7 +42,7 @@ + from mindformers.wrapper import build_wrapper + from mindformers.tools.register import MindFormerConfig + from mindformers.tools.logger import logger +-from mindformers.tools.utils import count_params ++from mindformers.tools.utils import count_params, save_aisbench_result + from mindformers.auto_class import AutoModel + from mindformers.pet import get_pet_model + from .config_args import ConfigArguments +@@ -51,6 +51,10 @@ + from .optimizer_grouped_parameters import get_optimizer_grouped_parameters + from .utils import set_seed, check_train_data_loader_type, \ + check_eval_data_loader_type, check_optimizer_and_lr_type, check_wrapper_config ++try: ++ import ais_utils ++except Exception: ++ print("ais_utils not find") + + SUPPORT_TASKS = MindFormerBook().get_trainer_support_task_list() + SUPPORT_MODEL_NAMES = MindFormerBook().get_model_name_support_list() +@@ -543,7 +547,8 @@ + load_resume_context_from_checkpoint(config) + + # build dataset +- logger.info(".........Build Dataset For Train..........") ++ logger.info(".........Build Dataset For Train..........") # dataload_start_time ++ save_aisbench_result("dataload_start_time", ais_utils.get_datatime().decode('utf-8')) + if dataset is None: + dataset = self.create_train_dataset() + self.set_train_dataset(dataset) +@@ -553,10 +558,12 @@ + * config.runner_config.sink_size / dataset.get_dataset_size()) + # pylint: disable=W0212 + dataset._dataset_helper = DatasetHelper(dataset, config.runner_config.sink_mode, +- config.runner_config.sink_size, epoch_num) ++ config.runner_config.sink_size, epoch_num) # dataload_end_time ++ save_aisbench_result("dataload_end_time", ais_utils.get_datatime().decode('utf-8')) + + # build network +- 
logger.info(".........Build Net For Train..........") ++ logger.info(".........Build Net For Train..........") # train_launch_start_time ++ save_aisbench_result("train_launch_start_time", ais_utils.get_datatime().decode('utf-8')) + eval_network = None + if network is None and wrapper is None and \ + self.model_wrapper is None and self.network is None: +@@ -651,18 +658,30 @@ + step_interval=config.eval_step_interval if config.eval_step_interval else 100, + epoch_interval=config.eval_epoch_interval if config.eval_epoch_interval else -1, + ) +- callbacks.append(eval_callback) ++ callbacks.append(eval_callback) # train_launch_end_time + + logger.info(".........Starting Training Model..........") + if int(os.getenv("RANK_ID", '0')) % 8 == 0: + pprint(config) + logger.info(".........Model Compiling, Please Wait a Moment...........") ++ model.build(dataset, None, sink_size=config.runner_config.sink_size, epoch=config.runner_config.epochs) ++ save_aisbench_result("train_launch_end_time", ais_utils.get_datatime().decode('utf-8')) ++ import time ++ train_start_time = time.time() ++ save_aisbench_result("train_start_time", ais_utils.get_datatime().decode('utf-8')) # train_start_time + model.train(config.runner_config.epochs, dataset, + callbacks=callbacks, + dataset_sink_mode=config.runner_config.sink_mode, + sink_size=config.runner_config.sink_size, + initial_epoch=config.runner_config.initial_epoch) +- logger.info(".........Training Over!.............") ++ logger.info(".........Training Over!.............") # train_end_time ++ train_end_time = time.time() ++ all_data_sum = int(dataset.get_dataset_size() * config.train_dataset.batch_size / int(os.getenv("RANK_SIZE", '8'))) * \ ++ config.runner_config.origin_epochs * config.model.model_config.seq_length ++ throughput_rate = all_data_sum / (train_end_time - train_start_time) ++ save_aisbench_result("train_end_time", ais_utils.get_datatime().decode('utf-8')) ++ save_aisbench_result("throughput_ratio", throughput_rate) ++ + + 
def evaluate_process( + self, +diff -Nur origin/requirements.txt code/requirements.txt +--- origin/requirements.txt 2023-12-06 11:50:58.396000000 +0800 ++++ code/requirements.txt 2023-12-06 11:50:58.492000000 +0800 +@@ -11,4 +11,5 @@ + pydantic==1.10.11 + mdtex2html + gradio +-opencv-python-headless +\ No newline at end of file ++opencv-python-headless ++pyyaml +\ No newline at end of file +diff -Nur origin/scripts/run_distribute.sh code/scripts/run_distribute.sh +--- origin/scripts/run_distribute.sh 2023-12-06 11:50:58.460000000 +0800 ++++ code/scripts/run_distribute.sh 2023-12-06 11:50:58.556000000 +0800 +@@ -179,7 +179,14 @@ + fi + fi + shopt -u extglob +- ++wait ++if [ $? -eq 0 ];then ++ echo "all train processes completed successfully!" ++ exit 0 ++else ++ echo "one or more train processes exited with a error!" ++ exit 1 ++fi + + #cd ./pretrain_parallel${START_DEVICE} || exit + #tail -f mindformer.log diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..a0282578d3b7db9cbc9b44d8dd838fe8230a3eb4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/benchmark.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 +declare -i ret_mode_failed=5 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) +export DEPEND_PATH=$BASE_PATH/dependencies/ + +function get_node_train_data() +{ + if [ "$LLAMA_RUN_MODE" = "only_finetune" ];then + [ ! 
-f $FINETUNE_CKPT_PATH ] || { echo "finetune base ckpt:$FINETUNE_CKPT_PATH";return $ret_failed; } + fi + return $ret_ok +} + +# 配置训练相关的环境变量 +source ${CODE_PATH}/config/config.sh || { logger_Warn "source file failed:$?";return $ret_init_failed; } + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh +if [ -d $PRETRAIN_DATA_PATH ];then + cp -r $PRETRAIN_DATA_PATH $CUR_PATH || { logger_Warn "ERROR: cp $PRETRAIN_DATA_PATH failed!";return $ret_init_failed; } +fi +if [ -d $FINETUNE_DATA_PATH ];then + cp -r $FINETUNE_DATA_PATH $CUR_PATH || { logger_Warn "ERROR: cp $FINETUNE_DATA_PATH failed!";return $ret_init_failed; } +fi + +. $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..48f1f14f9b8f5cc1f36bee0f6af06a224500741f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/cluster_offline_run.sh @@ -0,0 +1,146 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common_2.0.sh +. $CODE_PATH/common/node_common.sh + +# env check +export RELAT_WORK_PATH=work +export RELAT_RESULT_PATH=$RELAT_WORK_PATH/result +CONFIG_FILE="config.sh" +# set nodes work path. 
仅仅是管理节点的work/ +export WORK_PATH=${BASE_PATH}/work +# set nodes result path +export RESULT_PATH=${WORK_PATH}/result +local_env_cmd="source /etc/profile; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + export PYTHONPATH=$WORK_PATH:$PYTHONPATH; + export PYTHONPATH=$WORK_PATH/logging:$PYTHONPATH; + source $WORK_PATH/config/$CONFIG_FILE" +env_cmd="source /etc/profile; + export WORK_PATH=\$PWD/$RELAT_WORK_PATH; + export RESULT_PATH=\$PWD/$RELAT_RESULT_PATH; + export PYTHONPATH=\$WORK_PATH:\$PYTHONPATH; + export PYTHONPATH=\$WORK_PATH/logging:\$PYTHONPATH; + source \$WORK_PATH/config/$CONFIG_FILE" + +check_env() +{ + # check ranktable set + : "${RANK_SIZE?RANK_SIZE not set}" + : "${DEVICE_NUM?DEVICE_NUM not set}" + [[ $RANK_SIZE -eq 1 ]] || : "${RANK_TABLE_FILE?RANK_TABLE_FILE not set}" + [[ $RANK_SIZE -eq 1 ]] && [[ -n "$RANK_TABLE_FILE" ]] && { echo "ranksize=1 should not set RANK_TABLE_FILE";return 1; } + + # check python + : "${PYTHON_COMMAND?PYTHON_COMMAND not set}" + [ "$NODEINFO_FILE" == "" ] && { echo "NODEINFO_FILE not set, will not check cluster";return 0; } + if pip show ais_bench_cluster >/dev/null 2>&1;then + logger_Info "ais_bench cluster module exist, won't be installed again" + else + cluster_whl_path="${DEPEND_PATH}/cluster/ais_bench_cluster-*.whl" + if [ -f $cluster_whl_path ];then + pip install $cluster_whl_path --force-reinstall || { logger_Error "install cluster failed!";return 1; } + else + logger_Error "can't find ais_bench cluster wheel package" + fi + fi + + # check nodeinfofile exist + [[ $RANK_SIZE -le 8 ]] || check_file_valid "${NODEINFO_FILE}" || { echo "nodeinfofile:${NODEINFO_FILE} not valid" ; return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Error "source file failed:$?";return 1; } + if [ -d ${DEPEND_PATH}/logging ];then + cp -r ${DEPEND_PATH}/logging 
${CODE_PATH} + fi + check_env || { logger_Error "env check failed'" ; return 1; } + + # init ais_bench.cluster + cluster_init || { logger_Error "ais_bench_cluster init failed!";return 1; } + + # refresh result path + rm -rf ${BASE_PATH}/result;mkdir -p ${BASE_PATH}/result + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + rm -rf $WORK_PATH;mkdir -p $WORK_PATH + if [ "$NODEINFO_FILE" != "" ];then + cmd="rm -rf ${RELAT_WORK_PATH};mkdir -p ${RELAT_WORK_PATH}" + cluster_multi_exec "$cmd" serial || { logger_Error "renew workpath failed"; return 1; } + # copy code to node work path + fi + cp -r $CODE_PATH/* $WORK_PATH # CPU可以执行的都在host节点执行 + if [ "$NODEINFO_FILE" != "" ];then + # sync data if work_path not exist so new one.节点的work/ 路径是相对于在node_file中指定的work_path + cluster_multi_put "$WORK_PATH" "./" || { logger_Error "deploy code to work place failed"; return 1; } + fi + cmd="source /etc/profile; + export WORK_PATH=\$PWD/$RELAT_WORK_PATH; + source \$WORK_PATH/config/$CONFIG_FILE; + bash \$WORK_PATH/run_node.sh check" + cluster_multi_exec "$cmd" serial|| { return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + if [ "$LLAMA_RUN_MODE" == "full" ] || [ "$LLAMA_RUN_MODE" == "only_pretrain" ];then + if [ "$NODEINFO_FILE" == "" ];then + cmd="$local_env_cmd; + rm -rf $RESULT_PATH/*.json; + bash $WORK_PATH/run_node.sh train train " + else + cmd="$env_cmd; + rm -rf \$RESULT_PATH/*.json; + bash \$WORK_PATH/run_node.sh train train " + fi + cluster_multi_exec "$cmd" || { logger_Error "run train(pretrain) failed"; return 1; } + if [ "$NODEINFO_FILE" != "" ];then + cluster_multi_get "$RELAT_RESULT_PATH" "$BASE_PATH" || { logger_Error "cp result between nodes failed"; return 1; } + fi + export PYTHONPATH=$WORK_PATH/logging:$PYTHONPATH + bash $WORK_PATH/run_node.sh merge || { logger_Error "ckpt merge failed"; return 1; } + fi + if [ 
"$LLAMA_RUN_MODE" == "full" ] || [ "$LLAMA_RUN_MODE" == "only_finetune" ];then + if [ "$NODEINFO_FILE" == "" ];then + cmd="$local_env_cmd; + rm -rf $RESULT_PATH/*.json; + bash $WORK_PATH/run_node.sh train finetune " + else + cmd="$env_cmd; + rm -rf \$RESULT_PATH/*.json; + bash \$WORK_PATH/run_node.sh train finetune " + fi + cluster_multi_exec "$cmd" || { logger_Error "run train(finetune) failed"; return 1; } + if [ "$NODEINFO_FILE" != "" ];then + cluster_multi_get "$RELAT_RESULT_PATH" "$BASE_PATH" || { logger_Error "cp result between nodes failed"; return 1; } + fi + export PYTHONPATH=$WORK_PATH/logging:$PYTHONPATH + bash $WORK_PATH/run_node.sh merge || { logger_Error "ckpt merge failed"; return 1; } + fi + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="$local_env_cmd; + bash $WORK_PATH/run_node.sh eval" + eval "$cmd" || { logger_Error "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + source ${CODE_PATH}/config/$CONFIG_FILE + export PYTHONPATH=${CODE_PATH}/logging:$PYTHONPATH + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_llm_result.py ${BASE_PATH}/result ${RANK_SIZE} ${LLAMA_RUN_MODE} + find $BASE_PATH/result/ -name "*.ckpt" -exec rm {} \; + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/pre_conf_yaml.py b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/pre_conf_yaml.py new file mode 100644 index 0000000000000000000000000000000000000000..bf150fa73a0bed9ae3af1adbe06fd5ff53623ace --- /dev/null +++ 
b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/pre_conf_yaml.py @@ -0,0 +1,121 @@ + +# should be execute after distribut +import sys +import os +import yaml +import acl + +run_mode = sys.argv[1] +cur_path = os.path.dirname(os.path.abspath(__file__)) +try: + soc_version = acl.get_soc_name() +except Exception as err: + raise RuntimeError("get soc versiob failed!") from err + +pretrain_dataset = os.getenv('PRETRAIN_DATA_PATH') +finetune_dataset = os.getenv('FINETUNE_DATA_PATH') + +rank_size = int(os.getenv('RANK_SIZE')) +data_parallel = int(os.getenv('DATA_PARALLEL')) +model_parallel = int(os.getenv('MODEL_PARALLEL')) +pipeline_stage = int(os.getenv('PIPELINE_STAGE')) + +if not rank_size == data_parallel * model_parallel * pipeline_stage and run_mode != "finetune_eval": + raise RuntimeError("DATA_PARALLEL * MODEL_PARALLEL * PIPELINE_STAGE should equal to RANK_SIZE !") + +model_scale = os.getenv('LLAMA_MODEL_SCALE') +model_type = os.getenv('LLAMA_MODEL_TYPE') +config_path = os.path.join(cur_path, f'code/configs/llama{model_type}/') +epoch_size = os.getenv("EPOCH_SIZE") +sink_size = 2 +layer_num = os.getenv("LLAMA_LAYER_NUM") +eval_data_path = os.getenv('EVAL_DATASET_PATH') + +if '910B'in soc_version: + target_yaml = os.path.join(config_path, f'run_llama{model_type}_{model_scale}_910b.yaml') +else: + target_yaml = os.path.join(config_path, f'run_llama{model_type}_{model_scale}.yaml') +if os.getenv('LLAMA_RUN_MODE') == 'only_finetune': + ckpt_path = os.path.join(cur_path, os.getenv('FINETUNE_CKPT_PATH')) + if not os.path.exists(ckpt_path): + raise FileExistsError(f"ckpt_path: {ckpt_path} not find!") +else: + ckpt_path = os.path.join(cur_path, f'result/output/target_ckpt/rank_0/llama{model_type}_{model_scale}0.ckpt') + +if not os.path.exists(target_yaml): + raise FileExistsError(f"yaml file: {target_yaml} not find!") +if os.path.islink(target_yaml): + raise RuntimeError(f"yaml file: {target_yaml} is softlink!") + +if model_type == "2" and 
run_mode == "finetune_eval": + seq_length = 4096 +else: + seq_length = 2048 + +def change_parallel_params(data): + data_parallel = int(data['parallel_config']['data_parallel']) + model_parallel = int(data['parallel_config']['model_parallel']) + pipeline_stage = int(data['parallel_config']['pipeline_stage']) + + return int(rank_size / (data_parallel * model_parallel * pipeline_stage)) + +def write_pretrain_yaml(data): + data['load_checkpoint'] = '' + data['run_mode'] = 'train' + data['runner_config']['epochs'] = int(epoch_size) + data['runner_config']['sink_size'] = sink_size + data['parallel_config']['data_parallel'] = data_parallel + data['parallel_config']['model_parallel'] = model_parallel + data['parallel_config']['pipeline_stage'] = pipeline_stage + data['optimizer']['beta2'] = 0.95 + data['optimizer']['learning_rate'] = 3.e-4 + data['lr_schedule']['learning_rate'] = 3.e-4 + data['lr_schedule']['lr_end'] = 3.e-5 + data['train_dataset']['input_columns'] = ["input_ids"] + data['train_dataset']['data_loader']['dataset_dir'] = pretrain_dataset + data['eval_dataset']['data_loader']['dataset_dir'] = eval_data_path + data['model']['model_config']['num_layers'] = int(layer_num) + data['callbacks'][1]['save_checkpoint_steps'] = 100000 + return data + + +def write_finetune_yaml(data): + data['load_checkpoint'] = ckpt_path + data['run_mode'] = 'finetune' + data['runner_config']['epochs'] = int(epoch_size) + data['runner_config']['sink_size'] = sink_size + data['parallel_config']['data_parallel'] = data_parallel + data['parallel_config']['model_parallel'] = model_parallel + data['parallel_config']['pipeline_stage'] = pipeline_stage + data['optimizer']['beta2'] = 0.999 + data['optimizer']['learning_rate'] = 1.e-5 + data['lr_schedule']['learning_rate'] = 1.e-5 + data['lr_schedule']['lr_end'] = 1.e-5 + data['train_dataset']['input_columns'] = ["input_ids", "labels"] + data['train_dataset']['data_loader']['dataset_dir'] = finetune_dataset + 
data['eval_dataset']['data_loader']['dataset_dir'] = eval_data_path + data['model']['model_config']['num_layers'] = int(layer_num) + data['model']['model_config']['seq_length'] = seq_length + data['callbacks'][1]['save_checkpoint_steps'] = 100000 + return data + +data = {} +with open(target_yaml, 'r', encoding='utf-8') as file: + try: + data = yaml.safe_load(file) + except Exception as err: + raise RuntimeError(f"load target_yaml: {target_yaml} failed") from err + +if run_mode == 'train': + data = write_pretrain_yaml(data) +elif run_mode == "finetune": + data = write_finetune_yaml(data) +elif run_mode == "finetune_eval": + data = write_finetune_yaml(data) + +with open(target_yaml, 'w', encoding='utf-8') as file: + try: + yaml.safe_dump(data, file) + except Exception as err: + raise RuntimeError(f"dump target_yaml: {target_yaml} failed") from err + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..87a2dbed2ce0c1d812fdcc4f8725a9520a813cad --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_llama/scripts/run_node.sh @@ -0,0 +1,181 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +declare -i ret_ok=0 +declare -i ret_failed=1 + +SOC_VERSION=`python3 -c 'import acl;print(acl.get_soc_name())'` +if [[ "$SOC_VERSION" =~ "910B1" || "$SOC_VERSION" =~ "910B2" || "$SOC_VERSION" =~ "910B3" || "$SOC_VERSION" =~ "910B4" ]];then + LLAMA_RUN_YAML_NAME="run_llama${LLAMA_MODEL_TYPE}_${LLAMA_MODEL_SCALE}_910b.yaml" +else + LLAMA_RUN_YAML_NAME="run_llama${LLAMA_MODEL_TYPE}_${LLAMA_MODEL_SCALE}.yaml" +fi + +function get_node_rank_id_range() +{ + RANK_ID_RANGE="[0,8]" + # get server node id default is 0 + : "${NODE_ID:=0}" + # get rank start index + if [[ $DEVICE_NUM == 1 && $RANK_SIZE == 1 ]];then + : "${SINGLE_CARD_INDEX:=0}" + RANK_START=$SINGLE_CARD_INDEX + else + # get rank start index + RANK_START=`expr ${NODE_ID} \* $DEVICE_NUM` + fi + RANK_ID_MAX=$[DEVICE_NUM+RANK_START] + RANK_ID_RANGE="[$RANK_START,$RANK_ID_MAX]" +} + +function node_init() +{ + export PYTHONPATH=$WORK_PATH:$PYTHONPATH + + if [ $1 == "check" ];then + # install pyyaml + if pip show pyyaml >/dev/null 2>&1;then + logger_Info "pyyaml exist, won't be installed again" + else + pip_cmd="pip install pyyaml" + $pip_cmd || { logger_Warn "pyyaml install failed:$?";return $ret_failed; } + fi + # install mindformers + if pip show mindformers >/dev/null 2>&1;then + logger_Info "mindformers exist, won't be installed again" + else + cd $WORK_PATH/code + pip install . 
|| { logger_Warn "mindformers install failed:$?";return $ret_failed; } + cd $WORK_PATH + fi + fi + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + get_node_rank_id_range + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + rank_table_path=${WORK_PATH}/${RANK_TABLE_FILE} + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$rank_table_path" || { logger_Warn "node common check failed" ; return $ret_failed; } + + # check_mindspore_run_ok_Ascend ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return $ret_failed; } + check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return $ret_failed; } + logger_Debug "mindspore running successfully" + + if [ "$LLAMA_RUN_MODE" == "full" ] || [ "$LLAMA_RUN_MODE" == "train" ];then + check_file_valid "${WORK_PATH}/code/${PRETRAIN_DATA_PATH}" || { logger_Warn "PRETRAIN_DATA_PATH:${PRETRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "PRETRAIN_DATA_PATH is valid" + fi + + if [ "$LLAMA_RUN_MODE" == "full" ] || [ "$LLAMA_RUN_MODE" == "finetune" ];then + check_file_valid "${WORK_PATH}/code/${FINETUNE_DATA_PATH}" || { logger_Warn "FINETUNE_DATA_PATH:${FINETUNE_DATA_PATH} not valid path" ; return 1; } + logger_Debug "FINETUNE_DATA_PATH is valid" + check_file_valid "${WORK_PATH}/code/${EVAL_DATASET_PATH}" || { logger_Warn "EVAL_DATASET_PATH:${EVAL_DATASET_PATH} not valid path" ; return 1; } + logger_Debug "EVAL_DATASET_PATH is valid" + fi + +} + +function ckpt_merge() +{ + transform_ckpt_path=$WORK_PATH/code/mindformers/tools/transform_ckpt.py + result_output_path=$WORK_PATH/../result/output + cd $WORK_PATH + # ckpt merge + $PYTHON_COMMAND $transform_ckpt_path \ + --src_ckpt_strategy $result_output_path/strategy/ \ + --src_ckpt_dir $result_output_path/checkpoint/ \ + --dst_ckpt_dir $result_output_path/target_ckpt/ \ + 
--prefix "llama${LLAMA_MODEL_TYPE}_${LLAMA_MODEL_SCALE}" || { logger_Warn "ckpt merge failed, rank id range: $RANK_ID_RANGE" ; return $ret_failed; } + rm -rf $result_output_path/checkpoint/ +} + +function node_train() +{ + logger_Info "node_train running" + export LLAMA_CUR_RUN_MODE=$1 + source $WORK_PATH/config/config.sh + $PYTHON_COMMAND $WORK_PATH/pre_conf_yaml.py $1 # change yaml params + run_script_path=$WORK_PATH/code/scripts/ + run_yaml_path=$WORK_PATH/code/configs/llama${LLAMA_MODEL_TYPE}/$LLAMA_RUN_YAML_NAME + rank_table_path=${WORK_PATH}/$RANK_TABLE_FILE + # train run + cd $run_script_path + cmd="bash run_distribute.sh $rank_table_path $run_yaml_path $RANK_ID_RANGE $1" + [ "$NODEINFO_FILE" != "" ] && cmd="$cmd $RANK_SIZE" + echo "$cmd" + $cmd || { logger_Warn "node_run failed, rank id range: $RANK_ID_RANGE" ; return $ret_failed; } + mv $WORK_PATH/code/output/ $WORK_PATH/result/ || { logger_Warn "move output failed!" ; return $ret_failed; } + return $ret_ok +} + +function eval_run() +{ + logger_Info "eval_run running" + $PYTHON_COMMAND $WORK_PATH/pre_conf_yaml.py $1 # change yaml params + run_yaml_path=$WORK_PATH/code/configs/llama${LLAMA_MODEL_TYPE}/$LLAMA_RUN_YAML_NAME + eval_dataset_path=$WORK_PATH/code/$EVAL_DATASET_PATH + load_checkpoint_path=$WORK_PATH/../result/output/target_ckpt/rank_0/llama${LLAMA_MODEL_TYPE}_${LLAMA_MODEL_SCALE}0.ckpt + if [ "$EVAL_DATASET_TYPE" = "wikitext" ];then + echo "run eval using wiki" + eval_script_path=$WORK_PATH/code/run_mindformer.py + $PYTHON_COMMAND $eval_script_path \ + --config $run_yaml_path \ + --eval_dataset_dir $eval_dataset_path \ + --run_mode eval \ + --load_checkpoint $load_checkpoint_path \ + --epochs 1 \ + --use_parallel False \ + --device_id $EVAL_DEVICE_ID || { logger_Warn "run eval failed" ; return $ret_failed; } + elif [ "$EVAL_DATASET_TYPE" == "squad" ];then + echo "eval not supported yet" + else + echo "invalid eval mode" + rm -rf $load_checkpoint_path + return $ret_failed + fi + rm -rf 
$load_checkpoint_path + return $ret_ok +} + +function node_eval() +{ + logger_Info "node_eval running" + if [ "$LLAMA_RUN_MODE" == "full" ];then + eval_run "finetune_eval" + elif [ "$LLAMA_RUN_MODE" == "only_pretrain" ];then + echo "eval not supported yet" + elif [ "$LLAMA_RUN_MODE" == "only_finetune" ];then + eval_run "finetune_eval" + else + echo "llama run mode not supported" + return $ret_failed + fi + return $ret_ok +} + +main() +{ + type="$1" + mode="$2" + shift + node_init $type || { logger_Warn "init failed"; return $ret_failed; } + if [ "$type" == "train" ];then + node_train $mode || { logger_Warn "run_node_train failed"; return $ret_failed; } + elif [ "$type" == "merge" ];then + ckpt_merge || { logger_Warn "ckpt_merge failed"; return $ret_failed; } + elif [ "$type" == "eval" ];then + node_eval || { logger_Warn "run_node_eval failed"; return $ret_failed; } + elif [ "$type" == "check" ];then + node_check || { logger_Warn "run_node_check failed"; return $ret_failed; } + else + { logger_Warn "invalid argument '${type}'"; return $ret_failed; } + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..065998ce7d27c07c80771ff446961caaac88439b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..8a8bb379ec10658d7940fb0f9b89edbf2478da1b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/config/config.sh @@ -0,0 +1,17 @@ +#!/bin/bash +export PYTHON_COMMAND=python3.7 +export TRAIN_DATA_PATH=/home/datasets/pangu_30_step_ba64 +export PARAM_INIT_TYPE=fp32 +export MODE=2.6B +export STAGE_NUM=1 +export MICRO_SIZE=1 +export PER_BATCH=8 + +export RANK_SIZE=8 +export DEVICE_NUM=8 + +# need if rank_size > 1 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_64.json +# cluster need for node info +#export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..5122d7c51fc9d6ec5d146a411c5d74246e6957e2 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r1.5"; } + + if [ "$branch_args" == "r1.5" ];then + branch="master" + patch_file_name="r1.5" + commitid="abc34438588942642e45e7cf1e516134952a2f86" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/nlp/pangu_alpha" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2bdca78d3eaa658e65fe15ee31a11fab400c2e16 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/scripts/cluster_offline_run.sh @@ -0,0 +1,78 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_FILE?TRAIN_DATA_FILE not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + 
export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..fffd35b7dfde004525907179dc61ca0ec93b17eb --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_pangu_alpha/scripts/run_node.sh @@ -0,0 +1,83 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + [[ $RANK_SIZE -gt 1 ]] && DISTRUTE_ENABLE="True" || DISTRUTE_ENABLE="False" + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py --distribute=$DISTRUTE_ENABLE \ + --device_num=$DEVICE_NUM \ + --data_url=$TRAIN_DATA_PATH \ + --run_type=train \ + --param_init_type=$PARAM_INIT_TYPE \ + --mode=$MODE \ + --stage_num=$STAGE_NUM \ + --micro_size=$MICRO_SIZE + --per_batch_size=$PER_BATCH + " + return 0 +} + +function get_eval_cmd() +{ + + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/mindspore_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + + check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return 1; } + logger_Debug "mindspore running successfully" + + check_file_valid "${TRAIN_DATA_FILE}" || { logger_Warn "TRAIN_DATA_FILE:${TRAIN_DATA_FILE} not valid file" ; return 1; } + logger_Debug "TRAIN_DATA_FILE is valid" +} + + +function node_train() +{ + # 调用通用训练接口 + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" 
];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..abe3be71802a849f3c732984e547f4468e7435e5 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/build.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + + mkdir -p ${CURDIR}/output/config + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/config.sh -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/modelarts_config.py -r ${CURDIR}//output/config/ + [ "$1" == "r1.3" ] && { cp ${CURDIR}/config/modelarts_config.py.r1.3 -r ${CURDIR}//output/config/modelarts_config.py; } + [ -d ${CURDIR}/doc ] && cp ${CURDIR}/doc -r ${CURDIR}/output/ + + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..333b07143ddb373eadf2971d1a1104bd77f3bb54 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/config/config.sh @@ -0,0 +1,14 @@ +export PYTHON_COMMAND=python3.7 + +export TRAIN_DATA_PATH=/home/datasets/imagenet/train/ +export EVAL_DATA_PATH=/home/datasets/imagenet/val/ +export EPOCH_SIZE=90 + +export RANK_SIZE=8 +export DEVICE_NUM=8 + +# need if rank_size > 1 +export RANK_TABLE_FILE=/home/lcm/tool/rank_table_8p.json + +# cluster need for node info +#export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/config/modelarts_config.py b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/config/modelarts_config.py new file mode 100644 index 0000000000000000000000000000000000000000..621913981e43670cc826c0442abb7b00321208a3 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/config/modelarts_config.py @@ -0,0 +1,134 @@ +from easydict import EasyDict as ed + +# 该部分为认证信息,请向相关运维同事咨询并填写 +access_config = ed({ + # 登录需要的ak sk信息 + 'access_key': '', + 'secret_access_key': '', + # 连接OBS的服务地址。可包含协议类型、域名、端口号。(出于安全性考虑,建议使用https协议) + # 如果是计算中心,需要联系运维同事获取 + 'server': '', + # project_id/region_name: + # 项目ID/区域ID,获取方式参考链接 + # https://support.huaweicloud.com/api-iam/iam_17_0002.html + # 如果是计算中心,请咨询相关维护同事 + 'region_name': '', + 'project_id': '', + + # 如下配置针对计算中心等专有云 通用云不需要设置 设置为空 请咨询相关维护同事 + # 设置该信息后 需要设置相关的域名解析地址 + 'iam_endpoint': '', + 'obs_endpoint': '', + 'modelarts_endpoint' : '', +}) + +session_config = ed({ + # 运行模型的传入超参 + 'hyperparameters': [ + # 模型配置文件,默认boost模式,不需要更改 + {'label': 'config_path', 'value': 'resnet50_imagenet2012_Boost_config.yaml'}, + # 是否使能modelarts 必须设置为True,不需要修改 + 
{'label': 'enable_modelarts', 'value': 'True'}, + # 是否开启分布式,如果1卡以上的话都是True 一般不需要修改 + {'label': 'run_distribute', 'value': 'True'}, + # epoch次数 必须关注 当前默认设置为90 + {'label': 'epoch_size', 'value': '90'}, + # device数量 云上场景一般不需要修改 + {'label': 'device_num', 'value': '8'}, + # 是否保存ckpt文件 默认为True 保存ckpt + {'label': 'save_checkpoint', 'value': 'False'}, + # 保存ckpt的epoch数 必须修改并注意 该值必须要跟epoch数一致 这样提高性能 + {'label': 'save_checkpoint_epochs', 'value': '90'}, + ], + # 输入数据集obs目录,请按样例格式填写 + 'inputs': '/zgwtest/lcm_test/dataset/imagenet_small/', + # obs代码路径 程序会自动拷贝到该路径 + 'code_dir': '/zgwtest/lcm_test/resnet/', + # 启动文件 必须要在code_dir路径下,请按样例格式填写 + 'boot_file': '/zgwtest/lcm_test/resnet/train.py', + + # 如下为运行相关参数 + # job名称 如果云环境Modelarts服务训练作业job队列中没有,则会新建一个job;若和已有job同名,则会在该job中,新建测试实例. + 'job_name': "aisbench-debug", + + # 使用容器类型与镜像版本 + 'framework_type': 'Ascend-Powered-Engine', + 'framework_version': 'MindSpore-1.3-cann_5.0.2-python3.7-euleros2.8-aarch64', + + # 资源参数类型主要包括如下2个值 train_instance_type和pool_id + # 不设置pool_id 默认是公共池 设置了就是专属资源池 + # 只设置pool_id 不设置train_instance_type 默认为专属资源池的默认类型 + # train_instance_type 在程序打印中有提示的 一般为如下四个值 分别对应 1卡 2卡 4卡 8卡 + # ['modelarts.kat1.xlarge', 'modelarts.kat1.2xlarge', 'modelarts.kat1.4xlarge', 'modelarts.kat1.8xlarge'] + # https://support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0191.html 该链接指示获取方法 + + # 专属资源池id 不是则为None + 'pool_id': None, + # 训练类型 如下为8卡 如果是专属资源池id设置,那么该类型需要设置为None + 'train_instance_type': 'modelarts.kat1.8xlarge', + # 训练结点数 + 'train_instance_count': 1, + + # 云存储路径 默认为空 + # 'nas_type' : None, + # 'nas_share_addr' : None, + # 'nas_mount_path' : None, + + # 输出信息基准路径 整体路径为 train_url = out_base_url/version_name + "out_base_url": "/zgwtest/lcm_test/result/", + # job 描述前缀 + "job_description_prefix": 'lcm-debug desc', +}) + +session_config_v2 = ed({ + # 运行模型的传入超参 + 'parameters': [ + # 模型配置文件,默认boost模式,不需要更改 + {'name': 'config_path', 'value': 'resnet50_imagenet2012_Boost_config.yaml'}, + # 是否使能modelarts 必须设置为True,不需要修改 + 
{'name': 'enable_modelarts', 'value': 'True'}, + # 是否开启分布式,如果1卡以上的话都是True 一般不需要修改 + {'name': 'run_distribute', 'value': 'True'}, + # epoch次数 必须关注 当前默认设置为90 + {'name': 'epoch_size', 'value': '90'}, + # device数量 云上场景一般不需要修改 + {'name': 'device_num', 'value': '8'}, + # 是否保存ckpt文件 默认为True 保存ckpt + {'name': 'save_checkpoint', 'value': 'False'}, + # 保存ckpt的epoch数 必须修改并注意 该值必须要跟epoch数一致 这样提高性能 + {'name': 'save_checkpoint_epochs', 'value': '90'}, + ], + # 输入数据集obs目录,请按样例格式填写 + 'inputs': '/zgwtest/lcm_test/dataset/imagenet_small/', + + # obs代码路径 程序会自动拷贝到该路径. 和boot_files一起用于复合参数 training_files + 'code_dir': '/zgwtest/lcm_test/resnet/', + # 启动文件 必须要在code_dir路径下,请按样例格式填写 + 'boot_file': '/zgwtest/lcm_test/resnet/train.py', + + # 如下为运行相关参数 + # job名称 如果云环境Modelarts服务训练作业job队列中没有,则会新建一个job;若和已有job同名,则会在该job中,新建测试实例. + 'job_name': "aisbench-debug", + + # 使用容器类型与镜像版本 + 'framework_type': 'Ascend-Powered-Engine', + 'framework_version': 'mindspore_1.3.0-cann_5.0.2-py_3.7-euler_2.8.3-aarch64', + + # pool_id不设置或者设置为None, 默认是公共资源池。 设置了就表示是专属资源池。在ModelArts管理控制台,单击左侧“专属资源池”,在专属资源池列表中可以查看专属资源池ID,类似poolc90f063b + 'pool_id': None, + # 训练类型,默认8卡。 train_instance_type 在程序打印中有提示的,请注意紧随“get valid train_instance_types:”之后的打印输出. 
由modelarts.estimatorV2 类Estimator的接口get_train_instance_types()查询而来。 + # 请参见https://support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0431.html 该链接指示获取方法。注意不同云环境查询的结果不同 + 'train_instance_type': 'modelarts.kat1.8xlarge', + # 训练结点数 + 'train_instance_count': 1, + + # 云存储路径 默认为空 + # 'nas_type': None, + # 'nas_share_addr': None, + # 'nas_mount_path': None, + + # 输出信息基准路径 整体路径为 train_url = out_base_url/version_name + "out_base_url": "/zgwtest/lcm_test/result/", + # job 描述前缀 + "job_description_prefix": 'lcm-debug desc', +}) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/config/modelarts_config.py.r1.3 b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/config/modelarts_config.py.r1.3 new file mode 100644 index 0000000000000000000000000000000000000000..e7f207c9643ae5db6aced83511c147a9b01d9c5a --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/config/modelarts_config.py.r1.3 @@ -0,0 +1,134 @@ +from easydict import EasyDict as ed + +# 该部分为认证信息,请向相关运维同事咨询并填写 +access_config = ed({ + # 登录需要的ak sk信息 + 'access_key': '', + 'secret_access_key': '', + # 连接OBS的服务地址。可包含协议类型、域名、端口号。(出于安全性考虑,建议使用https协议) + # 如果是计算中心,需要联系运维同事获取 + 'server': '', + # project_id/region_name: + # 项目ID/区域ID,获取方式参考链接 + # https://support.huaweicloud.com/api-iam/iam_17_0002.html + # 如果是计算中心,请咨询相关维护同事 + 'region_name': '', + 'project_id': '', + + # 如下配置针对计算中心等专有云 通用云不需要设置 设置为空 请咨询相关维护同事 + # 设置该信息后 需要设置相关的域名解析地址 + 'iam_endpoint': '', + 'obs_endpoint': '', + 'modelarts_endpoint' : '', +}) + +session_config = ed({ + # 运行模型的传入超参 + 'hyperparameters': [ + # 模型配置文件,默认acc模式,不需要更改 + {'label': 'config_path', 'value': 'resnet50_imagenet2012_Acc_config.yaml'}, + # 是否使能modelarts 必须设置为True,不需要修改 + {'label': 'enable_modelarts', 'value': 'True'}, + # 是否开启分布式,如果1卡以上的话都是True 一般不需要修改 + {'label': 'run_distribute', 'value': 'True'}, + # epoch次数 必须关注 当前默认设置为90 + {'label': 'epoch_size', 'value': '90'}, + # device数量 云上场景一般不需要修改 + {'label': 
'device_num', 'value': '8'}, + # 是否保存ckpt文件 默认为True 保存ckpt + {'label': 'save_checkpoint', 'value': 'False'}, + # 保存ckpt的epoch数 必须修改并注意 该值必须要跟epoch数一致 这样提高性能 + {'label': 'save_checkpoint_epochs', 'value': '90'}, + ], + # 输入数据集obs目录,请按样例格式填写 + 'inputs': '/zgwtest/lcm_test/dataset/imagenet_small/', + # obs代码路径 程序会自动拷贝到该路径 + 'code_dir': '/zgwtest/lcm_test/resnet/', + # 启动文件 必须要在code_dir路径下,请按样例格式填写 + 'boot_file': '/zgwtest/lcm_test/resnet/train.py', + + # 如下为运行相关参数 + # job名称 如果云环境Modelarts服务训练作业job队列中没有,则会新建一个job;若和已有job同名,则会在该job中,新建测试实例. + 'job_name': "aisbench-debug", + + # 使用容器类型与镜像版本 + 'framework_type': 'Ascend-Powered-Engine', + 'framework_version': 'MindSpore-1.3-cann_5.0.2-python3.7-euleros2.8-aarch64', + + # 资源参数类型主要包括如下2个值 train_instance_type和pool_id + # 不设置pool_id 默认是公共池 设置了就是专属资源池 + # 只设置pool_id 不设置train_instance_type 默认为专属资源池的默认类型 + # train_instance_type 在程序打印中有提示的 一般为如下四个值 分别对应 1卡 2卡 4卡 8卡 + # ['modelarts.kat1.xlarge', 'modelarts.kat1.2xlarge', 'modelarts.kat1.4xlarge', 'modelarts.kat1.8xlarge'] + # https://support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0431.html 该链接指示获取方法 + + # 专属资源池id 不是则为None + 'pool_id': None, + # 训练类型 如下为8卡 如果是专属资源池id设置,那么该类型需要设置为None + 'train_instance_type': 'modelarts.kat1.8xlarge', + # 训练结点数 + 'train_instance_count': 1, + + # 云存储路径 默认为空 + # 'nas_type': None, + # 'nas_share_addr': None, + # 'nas_mount_path': None, + + # 输出信息基准路径 整体路径为 train_url = out_base_url/version_name + "out_base_url": "/zgwtest/lcm_test/result/", + # job 描述前缀 + "job_description_prefix": 'lcm-debug desc', +}) + +session_config_v2 = ed({ + # 运行模型的传入超参 + 'parameters': [ + # 模型配置文件,默认boost模式,不需要更改 + {'name': 'config_path', 'value': 'resnet50_imagenet2012_Acc_config.yaml'}, + # 是否使能modelarts 必须设置为True,不需要修改 + {'name': 'enable_modelarts', 'value': 'True'}, + # 是否开启分布式,如果1卡以上的话都是True 一般不需要修改 + {'name': 'run_distribute', 'value': 'True'}, + # epoch次数 必须关注 当前默认设置为90 + {'name': 'epoch_size', 'value': '90'}, + # device数量 云上场景一般不需要修改 + {'name': 'device_num', 
'value': '8'}, + # 是否保存ckpt文件 默认为True 保存ckpt + {'name': 'save_checkpoint', 'value': 'False'}, + # 保存ckpt的epoch数 必须修改并注意 该值必须要跟epoch数一致 这样提高性能 + {'name': 'save_checkpoint_epochs', 'value': '90'}, + ], + # 输入数据集obs目录,请按样例格式填写 + 'inputs': '/zgwtest/lcm_test/dataset/imagenet_small/', + + # obs代码路径 程序会自动拷贝到该路径. 和boot_files一起用于复合参数 training_files + 'code_dir': '/zgwtest/lcm_test/resnet/', + # 启动文件 必须要在code_dir路径下,请按样例格式填写 + 'boot_file': '/zgwtest/lcm_test/resnet/train.py', + + # 如下为运行相关参数 + # job名称 如果云环境Modelarts服务训练作业job队列中没有,则会新建一个job;若和已有job同名,则会在该job中,新建测试实例. + 'job_name': "aisbench-debug", + + # 使用容器类型与镜像版本 + 'framework_type': 'Ascend-Powered-Engine', + 'framework_version': 'mindspore_1.3.0-cann_5.0.2-py_3.7-euler_2.8.3-aarch64', + + # pool_id不设置或者设置为None, 默认是公共资源池。 设置了就表示是专属资源池。在ModelArts管理控制台,单击左侧“专属资源池”,在专属资源池列表中可以查看专属资源池ID,类似poolc90f063b + 'pool_id': None, + # 训练类型,默认8卡。 train_instance_type 在程序打印中有提示的,请注意紧随“get valid train_instance_types:”之后的打印输出. 由modelarts.estimatorV2 类Estimator的接口get_train_instance_types()查询而来。 + # 请参见https://support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0431.html 该链接指示获取方法。注意不同云环境查询的结果不同 + 'train_instance_type': 'modelarts.kat1.8xlarge', + # 训练节点数 + 'train_instance_count': 1, + + # 云存储路径 默认为空 + # 'nas_type': None, + # 'nas_share_addr': None, + # 'nas_mount_path': None, + + # 输出信息基准路径 整体路径为 train_url = out_base_url/version_name + "out_base_url": "/zgwtest/lcm_test/result/", + # job 描述前缀 + "job_description_prefix": 'lcm-debug desc', +}) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.10.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.10.patch new file mode 100644 index 0000000000000000000000000000000000000000..7d6905c7314e86aa0554627bbd7b849e69b945e4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.10.patch @@ -0,0 +1,162 @@ +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py 
+--- origin/src/model_utils/config.py 2023-04-10 14:53:38.020000000 +0800 ++++ code/src/model_utils/config.py 2023-04-10 14:53:38.040000000 +0800 +@@ -120,6 +120,8 @@ + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../../config/resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '../../config', path_args.config_path) + default, helper, choices = parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-10 14:53:38.020000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-10 14:53:38.040000000 +0800 +@@ -98,6 +98,12 @@ + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_dataset_path = os.path.join(os.path.join(base_path, "val")) ++ + if pre_process: + pre_process() + +diff -Nur origin/train.py code/train.py +--- origin/train.py 2023-04-10 14:53:38.030000000 +0800 ++++ code/train.py 2023-04-10 14:53:38.050000000 +0800 +@@ -17,7 +17,7 @@ + import glob + import os + import numpy as np +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -285,6 +285,17 @@ + metrics_name="acc") + cb += [eval_cb] + ++def run_eval_ckpt(target, model): ++ eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, ++ batch_size=config.batch_size, 
train_image_size=config.train_image_size, ++ eval_image_size=config.eval_image_size, ++ target=target, enable_cache=config.enable_cache, ++ cache_session_id=config.cache_session_id) ++ eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} ++ print("eval ckpt begin") ++ res = apply_eval(eval_param_dict) ++ print("eval ckpt result:{}".format(res)) ++ return res + + def set_save_ckpt_dir(): + """set save ckpt dir""" +@@ -348,6 +359,7 @@ + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -366,12 +378,95 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ end_time = time.time() ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ throughput_rate = all_data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} stepsize:{} batchsize:{} epochsize:{} alldatasum:{} single_throughput_rate:{}".format( ++ start_time, end_time, step_size, config.batch_size, config.epoch_size, all_data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = config.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(config.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++ best_res = 
run_eval_ckpt(target, model) ++ if get_rank_id() == 0: ++ import moxing as mox ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(config.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(best_res)) ++ accuracy_file1 = os.path.join(config.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(best_res)) ++ ranksize_file = os.path.join(config.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) ++ + + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") + ++import json ++import time ++import os ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: 
++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id ++ + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set") ++ else: ++ print("singleserver_mode not set") + train_net() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.3.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.3.patch new file mode 100644 index 0000000000000000000000000000000000000000..3a036dce9b114a51b0ac63ae2c96bfda8da8dadb --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.3.patch @@ -0,0 +1,230 @@ +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py +--- origin/src/model_utils/config.py 2022-07-11 15:10:57.060000000 +0800 ++++ code/src/model_utils/config.py 2022-07-11 15:10:57.070000000 +0800 +@@ -120,6 +120,8 @@ + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '..', '..', path_args.config_path) + default, helper, choices = parse_yaml(path_args.config_path) + pprint(default) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2022-07-11 15:10:57.060000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2022-07-11 15:10:57.070000000 +0800 +@@ -98,6 +98,12 @@ + if not 
os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_dataset_path = os.path.join(os.path.join(base_path, "val")) ++ + if pre_process: + pre_process() + +@@ -111,5 +117,6 @@ + if config.train_url: + print("Start to copy output directory") + sync_data(config.output_path, config.train_url) ++ + return wrapped_func + return wrapper +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-07-11 15:10:57.060000000 +0800 ++++ code/train.py 2022-07-11 15:10:57.070000000 +0800 +@@ -17,6 +17,7 @@ + import glob + import os + import numpy as np ++import time + + from mindspore import context + from mindspore import Tensor, Parameter +@@ -47,6 +48,61 @@ + + set_seed(1) + ++import json ++import time ++from mindspore.train.callback._callback import Callback ++ ++skip_sumary_count = 1 ++skip_data_sum = 0 ++skip_time_sum = 0 ++real_start_time = 0.0 ++all_data_sum = 0 ++ ++class ThroughputRate(Callback): ++ """ ++ Monitor the time in training. ++ ++ Args: ++ data_size (int): Dataset size. Default: None. 
++ """ ++ ++ def __init__(self, data_size=None): ++ super(ThroughputRate, self).__init__() ++ self.data_size = data_size ++ self.count = 0 ++ ++ def epoch_begin(self, run_context): ++ self.epoch_time = time.time() ++ global real_start_time ++ if self.count == 1: ++ real_start_time = self.epoch_time ++ self.count += 1 ++ ++ def epoch_end(self, run_context): ++ epoch_seconds = (time.time() - self.epoch_time) * 1000 ++ step_size = self.data_size ++ cb_params = run_context.original_args() ++ if hasattr(cb_params, "batch_num"): ++ batch_num = cb_params.batch_num ++ if isinstance(batch_num, int) and batch_num > 0: ++ step_size = cb_params.batch_num ++ ++ if not isinstance(step_size, int) or step_size < 1: ++ logger.error("data_size must be positive int.") ++ return ++ ++ global skip_sumary_count ++ global skip_data_sum ++ global skip_time_sum ++ global all_data_sum ++ if skip_sumary_count > 0: ++ skip_sumary_count = skip_sumary_count -1 ++ skip_data_sum += self.data_size * config.batch_size ++ skip_time_sum += (time.time() - self.epoch_time) ++ all_data_sum += self.data_size * config.batch_size ++ print("data_size:{} batch_size:{} datasum:{} skipcount:{} skip_data_sum:{} skip_time_sum:{}".format( ++ self.data_size, config.batch_size, all_data_sum, skip_sumary_count, skip_data_sum, skip_time_sum)) ++ + + class LossCallBack(LossMonitor): + """ +@@ -311,6 +367,17 @@ + cb += [eval_cb] + + ++def run_eval_ckpt(target, model): ++ eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, ++ batch_size=config.batch_size, target=target, enable_cache=config.enable_cache, ++ cache_session_id=config.cache_session_id) ++ eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} ++ print("eval ckpt begin") ++ res = apply_eval(eval_param_dict) ++ print("eval ckpt result:{}".format(res)) ++ return res ++ ++ + def set_save_ckpt_dir(): + """set save ckpt dir""" + ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path) +@@ 
-373,7 +440,7 @@ + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +- cb = [time_cb, loss_cb] ++ cb = [time_cb, loss_cb, ThroughputRate(data_size=step_size)] + ckpt_save_dir = set_save_ckpt_dir() + if config.save_checkpoint: + ckpt_append_info = [{"epoch_num": config.has_trained_epoch, "step_num": config.has_trained_step}] +@@ -388,12 +455,94 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ end_time = time.time() ++ throughput_rate = (all_data_sum - skip_data_sum ) / (end_time - real_start_time) ++ print("train done starttime:{} real:{} endtime:{} alldatasum:{} skipcount:{} skiptime:{}".format( ++ start_time, real_start_time, end_time, all_data_sum, skip_data_sum, skip_time_sum)) ++ import moxing as mox ++ result_url = config.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(config.train_url, os.getenv("BATCH_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++ best_res = run_eval_ckpt(target, model) ++ if get_rank_id() == 0: ++ import moxing as mox ++ server_id = os.getenv("BATCH_TASK_INDEX", 0) ++ accuracy_file = os.path.join(config.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(best_res)) ++ accuracy_file1 = os.path.join(config.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(best_res)) ++ ranksize_file = os.path.join(config.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ 
mox.file.write(ranksize_file, str(get_device_num())) + + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set") ++ else: ++ print("singleserver_mode not set") + train_net() diff --git 
a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.5.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.5.patch new file mode 100644 index 0000000000000000000000000000000000000000..f9aa3cc5a6590b2aeea3a8eb51174866c63f220f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.5.patch @@ -0,0 +1,162 @@ +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py +--- origin/src/model_utils/config.py 2022-07-11 17:36:33.370000000 +0800 ++++ code/src/model_utils/config.py 2022-07-11 17:36:33.380000000 +0800 +@@ -120,6 +120,8 @@ + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../../config/resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '../../config', path_args.config_path) + default, helper, choices = parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2022-07-11 17:36:33.370000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2022-07-11 17:36:33.380000000 +0800 +@@ -97,6 +97,12 @@ + config.device_id = get_device_id() + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) ++ ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_dataset_path = os.path.join(os.path.join(base_path, "val")) + + if pre_process: + pre_process() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-07-11 
17:36:33.370000000 +0800 ++++ code/train.py 2022-07-11 17:36:33.390000000 +0800 +@@ -17,6 +17,7 @@ + import glob + import os + import numpy as np ++import time + + from mindspore import context + from mindspore import Tensor +@@ -296,6 +297,17 @@ + metrics_name="acc") + cb += [eval_cb] + ++def run_eval_ckpt(target, model): ++ eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, ++ batch_size=config.batch_size, train_image_size=config.train_image_size, ++ eval_image_size=config.eval_image_size, ++ target=target, enable_cache=config.enable_cache, ++ cache_session_id=config.cache_session_id) ++ eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} ++ print("eval ckpt begin") ++ res = apply_eval(eval_param_dict) ++ print("eval ckpt result:{}".format(res)) ++ return res + + def set_save_ckpt_dir(): + """set save ckpt dir""" +@@ -358,6 +370,8 @@ + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) ++ + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -376,12 +390,95 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ end_time = time.time() ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ throughput_rate = all_data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} stepsize:{} batchsize:{} epochsize:{} alldatasum:{} single_throughput_rate:{}".format( ++ start_time, end_time, step_size, config.batch_size, 
config.epoch_size, all_data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = config.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(config.train_url, os.getenv("BATCH_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++ best_res = run_eval_ckpt(target, model) ++ if get_rank_id() == 0: ++ import moxing as mox ++ server_id = os.getenv("BATCH_TASK_INDEX", 0) ++ accuracy_file = os.path.join(config.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(best_res)) ++ accuracy_file1 = os.path.join(config.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(best_res)) ++ ranksize_file = os.path.join(config.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) + + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ 
rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set") ++ else: ++ print("singleserver_mode not set") + train_net() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.7.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.7.patch new file mode 100644 index 0000000000000000000000000000000000000000..59382103d4aa8fa22fa71eeee56a05ead3d9f62f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.7.patch @@ -0,0 +1,162 @@ +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py +--- origin/src/model_utils/config.py 2022-05-22 20:29:23.680000000 +0800 ++++ code/src/model_utils/config.py 2022-05-22 20:29:23.710000000 +0800 +@@ -120,6 +120,8 @@ + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../../config/resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '../../config', path_args.config_path) + default, helper, choices = 
parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2022-05-22 20:29:23.680000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2022-05-22 20:29:23.710000000 +0800 +@@ -97,6 +97,12 @@ + config.device_id = get_device_id() + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) ++ ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_dataset_path = os.path.join(os.path.join(base_path, "val")) + + if pre_process: + pre_process() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-05-22 20:29:23.710000000 +0800 ++++ code/train.py 2022-05-22 20:29:23.740000000 +0800 +@@ -17,6 +17,7 @@ + import glob + import os + import numpy as np ++import time + + import mindspore as ms + from mindspore import Tensor +@@ -294,6 +295,17 @@ + metrics_name="acc") + cb += [eval_cb] + ++def run_eval_ckpt(target, model): ++ eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, ++ batch_size=config.batch_size, train_image_size=config.train_image_size, ++ eval_image_size=config.eval_image_size, ++ target=target, enable_cache=config.enable_cache, ++ cache_session_id=config.cache_session_id) ++ eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} ++ print("eval ckpt begin") ++ res = apply_eval(eval_param_dict) ++ print("eval ckpt result:{}".format(res)) ++ return res + + def set_save_ckpt_dir(): + """set save ckpt dir""" +@@ -356,6 +368,8 @@ + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") + ++ model.build(dataset, 
None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) ++ + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -374,12 +388,95 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ end_time = time.time() ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ throughput_rate = all_data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} stepsize:{} batchsize:{} epochsize:{} alldatasum:{} single_throughput_rate:{}".format( ++ start_time, end_time, step_size, config.batch_size, config.epoch_size, all_data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = config.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(config.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++ best_res = run_eval_ckpt(target, model) ++ if get_rank_id() == 0: ++ import moxing as mox ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(config.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(best_res)) ++ accuracy_file1 = os.path.join(config.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(best_res)) ++ ranksize_file = os.path.join(config.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) + + if config.run_eval and 
config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set") ++ else: ++ print("singleserver_mode not set") + train_net() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.8.patch 
b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.8.patch new file mode 100644 index 0000000000000000000000000000000000000000..73ab6b949155fb71bb02155460a13e028e1c3890 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.8.patch @@ -0,0 +1,166 @@ +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py +--- origin/src/model_utils/config.py 2023-04-10 13:02:19.630000000 +0800 ++++ code/src/model_utils/config.py 2023-04-10 13:02:19.650000000 +0800 +@@ -120,6 +120,8 @@ + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../../config/resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '../../config', path_args.config_path) + default, helper, choices = parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-10 13:02:19.630000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-10 13:02:19.650000000 +0800 +@@ -98,6 +98,13 @@ + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_dataset_path = os.path.join(os.path.join(base_path, "val")) ++ + if pre_process: + pre_process() + +diff -Nur origin/train.py code/train.py +--- origin/train.py 2023-04-10 13:02:19.640000000 +0800 ++++ code/train.py 2023-04-10 13:02:19.660000000 +0800 +@@ -17,7 +17,7 @@ + import glob + import os + 
import numpy as np +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -288,6 +288,17 @@ + metrics_name="acc") + cb += [eval_cb] + ++def run_eval_ckpt(target, model): ++ eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, ++ batch_size=config.batch_size, train_image_size=config.train_image_size, ++ eval_image_size=config.eval_image_size, ++ target=target, enable_cache=config.enable_cache, ++ cache_session_id=config.cache_session_id) ++ eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} ++ print("eval ckpt begin") ++ res = apply_eval(eval_param_dict) ++ print("eval ckpt result:{}".format(res)) ++ return res + + def set_save_ckpt_dir(): + """set save ckpt dir""" +@@ -351,6 +362,7 @@ + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -369,12 +381,98 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ end_time = time.time() ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ throughput_rate = all_data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} stepsize:{} batchsize:{} epochsize:{} alldatasum:{} single_throughput_rate:{}".format( ++ start_time, end_time, step_size, config.batch_size, config.epoch_size, all_data_sum, throughput_rate)) ++ import moxing as mox ++ 
result_url = config.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(config.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++ ++ best_res = run_eval_ckpt(target, model) ++ if get_rank_id() == 0: ++ import moxing as mox ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(config.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(best_res)) ++ accuracy_file1 = os.path.join(config.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(best_res)) ++ ranksize_file = os.path.join(config.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) + + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") + + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ 
FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id ++ ++ + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set") ++ else: ++ print("singleserver_mode not set") + train_net() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.9.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.9.patch new file mode 100644 index 0000000000000000000000000000000000000000..3c7a8e05a1fa5860c7429402c17f8b5a88f09f40 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r1.9.patch @@ -0,0 +1,163 @@ +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py +--- origin/src/model_utils/config.py 2022-10-17 21:17:23.696000000 +0800 ++++ code/src/model_utils/config.py 2022-10-17 21:17:23.716000000 +0800 +@@ -120,6 +120,8 @@ + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../../config/resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '../../config', path_args.config_path) + default, helper, choices = parse_yaml(path_args.config_path) + args = 
parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2022-10-17 21:17:23.696000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2022-10-17 21:17:23.716000000 +0800 +@@ -98,6 +98,13 @@ + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_dataset_path = os.path.join(os.path.join(base_path, "val")) ++ ++ + if pre_process: + pre_process() + +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-10-17 21:17:23.684000000 +0800 ++++ code/train.py 2022-10-17 21:17:23.700000000 +0800 +@@ -17,6 +17,7 @@ + import glob + import os + import numpy as np ++import time + + import mindspore as ms + import mindspore.nn as nn +@@ -286,6 +287,18 @@ + cb += [eval_cb] + + ++def run_eval_ckpt(target, model): ++ eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, ++ batch_size=config.batch_size, train_image_size=config.train_image_size, ++ eval_image_size=config.eval_image_size, ++ target=target, enable_cache=config.enable_cache, ++ cache_session_id=config.cache_session_id) ++ eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} ++ print("eval ckpt begin") ++ res = apply_eval(eval_param_dict) ++ print("eval ckpt result:{}".format(res)) ++ return res ++ + def set_save_ckpt_dir(): + """set save ckpt dir""" + ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path) +@@ -348,6 +361,7 @@ + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") + ++ model.build(dataset, None, 
sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -366,12 +380,95 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) + ++ end_time = time.time() ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ throughput_rate = all_data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} stepsize:{} batchsize:{} epochsize:{} alldatasum:{} single_throughput_rate:{}".format( ++ start_time, end_time, step_size, config.batch_size, config.epoch_size, all_data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = config.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(config.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++ best_res = run_eval_ckpt(target, model) ++ if get_rank_id() == 0: ++ import moxing as mox ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(config.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(best_res)) ++ accuracy_file1 = os.path.join(config.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(best_res)) ++ ranksize_file = os.path.join(config.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) ++ + if config.run_eval and 
config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") + ++import json ++import time ++import os + ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id ++ + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set") ++ else: ++ print("singleserver_mode not set") + train_net() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r2.0.patch 
b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r2.0.patch new file mode 100644 index 0000000000000000000000000000000000000000..4956e0f6e0062203ea9f9c35a299d5d09cc9c2cb --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r2.0.patch @@ -0,0 +1,164 @@ +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py +--- origin/src/model_utils/config.py 2023-04-07 20:52:22.470000000 +0800 ++++ code/src/model_utils/config.py 2023-04-07 20:52:22.490000000 +0800 +@@ -120,6 +120,8 @@ + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../../config/resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '../../config', path_args.config_path) + default, helper, choices = parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-07 20:52:22.470000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-07 20:52:22.490000000 +0800 +@@ -101,6 +101,13 @@ + if pre_process: + pre_process() + ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_dataset_path = os.path.join(os.path.join(base_path, "val")) ++ ++ + run_func(*args, **kwargs) + + # Upload data to train_url +diff -Nur origin/train.py code/train.py +--- origin/train.py 2023-04-07 20:52:22.480000000 +0800 ++++ code/train.py 2023-04-07 20:52:22.500000000 +0800 +@@ -14,7 +14,7 @@ + # 
============================================================================ + """train resnet.""" + import os +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -132,6 +132,17 @@ + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + return loss + ++def run_eval_ckpt(target, model): ++ eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, ++ batch_size=config.batch_size, train_image_size=config.train_image_size, ++ eval_image_size=config.eval_image_size, ++ target=target, enable_cache=config.enable_cache, ++ cache_session_id=config.cache_session_id) ++ eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} ++ print("eval ckpt begin") ++ res = apply_eval(eval_param_dict) ++ print("eval ckpt result:{}".format(res)) ++ return res + + @moxing_wrapper() + def train_net(): +@@ -197,6 +208,8 @@ + ms.load_param_into_net(opt, resume_param) + config.logger.info('resume train from epoch: %s', config.start_epoch) + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size) ++ + # define callbacks + loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=10) + resume_cb = ResumeCallback(config.start_epoch) +@@ -223,11 +236,95 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.logger.save_args(config) ++ start_time = time.time() + model.train(config.epoch_size - config.start_epoch, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ end_time = time.time() ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ throughput_rate = all_data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} stepsize:{} batchsize:{} epochsize:{} alldatasum:{} single_throughput_rate:{}".format( ++ start_time, end_time, 
step_size, config.batch_size, config.epoch_size, all_data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = config.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(config.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++ best_res = run_eval_ckpt(target, model) ++ if get_rank_id() == 0: ++ import moxing as mox ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(config.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(best_res)) ++ accuracy_file1 = os.path.join(config.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(best_res)) ++ ranksize_file = os.path.join(config.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) ++ + + config.logger.info("If run eval and enable_cache Remember to shut down the cache server via \"cache_admin --stop\"") + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = 
[cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id ++ + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set") ++ else: ++ print("singleserver_mode not set") + train_net() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r2.1.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r2.1.patch new file mode 100644 index 0000000000000000000000000000000000000000..4956e0f6e0062203ea9f9c35a299d5d09cc9c2cb --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r2.1.patch @@ -0,0 +1,164 @@ +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py +--- origin/src/model_utils/config.py 2023-04-07 20:52:22.470000000 +0800 ++++ code/src/model_utils/config.py 2023-04-07 20:52:22.490000000 +0800 +@@ -120,6 +120,8 @@ + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../../config/resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '../../config', path_args.config_path) + default, helper, 
choices = parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-07 20:52:22.470000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-07 20:52:22.490000000 +0800 +@@ -101,6 +101,13 @@ + if pre_process: + pre_process() + ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_dataset_path = os.path.join(os.path.join(base_path, "val")) ++ ++ + run_func(*args, **kwargs) + + # Upload data to train_url +diff -Nur origin/train.py code/train.py +--- origin/train.py 2023-04-07 20:52:22.480000000 +0800 ++++ code/train.py 2023-04-07 20:52:22.500000000 +0800 +@@ -14,7 +14,7 @@ + # ============================================================================ + """train resnet.""" + import os +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -132,6 +132,17 @@ + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + return loss + ++def run_eval_ckpt(target, model): ++ eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, ++ batch_size=config.batch_size, train_image_size=config.train_image_size, ++ eval_image_size=config.eval_image_size, ++ target=target, enable_cache=config.enable_cache, ++ cache_session_id=config.cache_session_id) ++ eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} ++ print("eval ckpt begin") ++ res = apply_eval(eval_param_dict) ++ print("eval ckpt result:{}".format(res)) ++ return res + + @moxing_wrapper() + def train_net(): +@@ -197,6 +208,8 @@ + 
ms.load_param_into_net(opt, resume_param) + config.logger.info('resume train from epoch: %s', config.start_epoch) + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size) ++ + # define callbacks + loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=10) + resume_cb = ResumeCallback(config.start_epoch) +@@ -223,11 +236,95 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.logger.save_args(config) ++ start_time = time.time() + model.train(config.epoch_size - config.start_epoch, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ end_time = time.time() ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ throughput_rate = all_data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} stepsize:{} batchsize:{} epochsize:{} alldatasum:{} single_throughput_rate:{}".format( ++ start_time, end_time, step_size, config.batch_size, config.epoch_size, all_data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = config.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(config.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++ best_res = run_eval_ckpt(target, model) ++ if get_rank_id() == 0: ++ import moxing as mox ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(config.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(best_res)) ++ accuracy_file1 = os.path.join(config.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(best_res)) ++ ranksize_file = os.path.join(config.train_url, "ranksize.json") ++ if 
mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) ++ + + config.logger.info("If run eval and enable_cache Remember to shut down the cache server via \"cache_admin --stop\"") + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id ++ + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set") ++ else: ++ print("singleserver_mode not set") + 
train_net() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r2.2.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r2.2.patch new file mode 100644 index 0000000000000000000000000000000000000000..4956e0f6e0062203ea9f9c35a299d5d09cc9c2cb --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/modelarts_r2.2.patch @@ -0,0 +1,164 @@ +diff -Nur origin/src/model_utils/config.py code/src/model_utils/config.py +--- origin/src/model_utils/config.py 2023-04-07 20:52:22.470000000 +0800 ++++ code/src/model_utils/config.py 2023-04-07 20:52:22.490000000 +0800 +@@ -120,6 +120,8 @@ + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \ + "../../config/resnet50_cifar10_config.yaml"), help="Config file path") + path_args, _ = parser.parse_known_args() ++ if not os.path.exists(path_args.config_path): ++ path_args.config_path = os.path.join(current_dir, '../../config', path_args.config_path) + default, helper, choices = parse_yaml(path_args.config_path) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) +diff -Nur origin/src/model_utils/moxing_adapter.py code/src/model_utils/moxing_adapter.py +--- origin/src/model_utils/moxing_adapter.py 2023-04-07 20:52:22.470000000 +0800 ++++ code/src/model_utils/moxing_adapter.py 2023-04-07 20:52:22.490000000 +0800 +@@ -101,6 +101,13 @@ + if pre_process: + pre_process() + ++ base_path = config.data_path ++ if os.path.exists(os.path.join(base_path, "train")): ++ config.data_path = os.path.join(base_path, "train") ++ if os.path.exists(os.path.join(base_path, "val")): ++ config.eval_dataset_path = os.path.join(os.path.join(base_path, "val")) ++ ++ + run_func(*args, **kwargs) + + # Upload data to train_url +diff -Nur origin/train.py code/train.py +--- origin/train.py 2023-04-07 20:52:22.480000000 +0800 ++++ 
code/train.py 2023-04-07 20:52:22.500000000 +0800 +@@ -14,7 +14,7 @@ + # ============================================================================ + """train resnet.""" + import os +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -132,6 +132,17 @@ + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + return loss + ++def run_eval_ckpt(target, model): ++ eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, ++ batch_size=config.batch_size, train_image_size=config.train_image_size, ++ eval_image_size=config.eval_image_size, ++ target=target, enable_cache=config.enable_cache, ++ cache_session_id=config.cache_session_id) ++ eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "acc"} ++ print("eval ckpt begin") ++ res = apply_eval(eval_param_dict) ++ print("eval ckpt result:{}".format(res)) ++ return res + + @moxing_wrapper() + def train_net(): +@@ -197,6 +208,8 @@ + ms.load_param_into_net(opt, resume_param) + config.logger.info('resume train from epoch: %s', config.start_epoch) + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size) ++ + # define callbacks + loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=10) + resume_cb = ResumeCallback(config.start_epoch) +@@ -223,11 +236,95 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.logger.save_args(config) ++ start_time = time.time() + model.train(config.epoch_size - config.start_epoch, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ end_time = time.time() ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ throughput_rate = all_data_sum / (int)(end_time - start_time) ++ print("train done starttime:{} endtime:{} stepsize:{} batchsize:{} epochsize:{} 
alldatasum:{} single_throughput_rate:{}".format( ++ start_time, end_time, step_size, config.batch_size, config.epoch_size, all_data_sum, throughput_rate)) ++ import moxing as mox ++ result_url = config.train_url ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ result_url = os.path.join(config.train_url, os.getenv("VC_TASK_INDEX", "0")) ++ mox.file.make_dirs(result_url) ++ throughput_file = os.path.join(result_url, "throughput_" + str(get_rank_id()) + ".json") ++ mox.file.write(throughput_file, str(throughput_rate)) ++ ++ best_res = run_eval_ckpt(target, model) ++ if get_rank_id() == 0: ++ import moxing as mox ++ server_id = os.getenv("VC_TASK_INDEX", 0) ++ accuracy_file = os.path.join(config.train_url, "accuracy_{}.json".format(server_id)) ++ mox.file.write(accuracy_file, str(best_res)) ++ accuracy_file1 = os.path.join(config.train_url, "accuracy.json") ++ with mox.file.File(accuracy_file1, 'w') as f: ++ f.write(str(best_res)) ++ ranksize_file = os.path.join(config.train_url, "ranksize.json") ++ if mox.file.exists(ranksize_file) == False: ++ mox.file.write(ranksize_file, str(get_device_num())) ++ + + config.logger.info("If run eval and enable_cache Remember to shut down the cache server via \"cache_admin --stop\"") + ++import json ++import time ++import os ++ ++class FileOperator: ++ @staticmethod ++ def create_empty_file(file_path): ++ f = open(file_path, "w") ++ f.close() ++ ++ @staticmethod ++ def read_json_to_dict(file_path): ++ f = open(file_path, 'r') ++ dic = json.load(f) ++ f.close() ++ return dic ++ ++ @staticmethod ++ def write_json_from_dict(file_path, source_dict): ++ json_str = json.dumps(source_dict) ++ with open(file_path, 'w') as json_file: ++ json_file.write(json_str) ++ ++def generate_single_node_rank_table(rank_table, old_rank_id): ++ import copy ++ server_list = copy.deepcopy(rank_table['server_list']) ++ server_index = old_rank_id // 8 ++ cur_server = server_list[server_index] ++ for device in cur_server['device']: ++ device['rank_id'] = 
str(int(device['rank_id']) % 8) ++ rank_table['server_list'] = [cur_server] ++ rank_table['server_count'] = "1" ++ FileOperator.write_json_from_dict(os.getenv("RANK_TABLE_FILE"), rank_table) ++ ++def set_singleserver_mode(): ++ old_rank_id = int(os.environ['RANK_ID']) ++ new_rank_id = old_rank_id % 8 ++ os.environ['RANK_ID'] = str(new_rank_id) ++ os.environ['DEVICE_ID'] = str(new_rank_id) ++ os.environ["RANK_SIZE"] = str(8) ++ #os.environ["SERVER_ID"] = str(old_rank_id // 8) ++ ++ empty_file_path = "/tmp/tmp.txt" ++ if new_rank_id != 0: ++ while not os.path.exists(empty_file_path): ++ time.sleep(1) ++ else: ++ rank_table = FileOperator.read_json_to_dict(os.getenv("RANK_TABLE_FILE")) ++ generate_single_node_rank_table(rank_table, old_rank_id) ++ FileOperator.create_empty_file(empty_file_path) ++ return new_rank_id ++ + + if __name__ == '__main__': ++ if os.getenv("SINGLESERVER_MODE", "") == "True": ++ set_singleserver_mode() ++ print("singleserver_mode set") ++ else: ++ print("singleserver_mode not set") + train_net() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..c79dc7469a6ef4604f5c85e8016d759d29d59d98 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/patch.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r2.3"; } + + modelzoo_sub_dir="mindspore/model_zoo/official/cv/resnet" + if [ "$branch_args" == "r1.1" ];then + branch="r1.1" + patch_file_name="r1.1" + commitid="9c133b6f709e12ed7085c31f028e7c925ee57828" + git_url="https://gitee.com/mindspore/mindspore.git" + elif [ "$branch_args" == "r1.2" ];then + branch="r1.2" + patch_file_name="r1.2" + commitid="cd002779dc5e2bc2da85b9a33e8950aa3bb50ed2" + git_url="https://gitee.com/mindspore/mindspore.git" + elif [ "$branch_args" == "r1.3" ];then + branch="r1.3" + patch_file_name="r1.3" + commitid="d9d4960262617d964d669ef8e3287daf347d5a7c" + git_url="https://gitee.com/mindspore/mindspore.git" + elif [ "$branch_args" == "r1.5" ];then + branch="master" + patch_file_name="r1.5" + commitid="f6537762d0aea541adfed8644da452a476c28321" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/resnet" + elif [ "$branch_args" == "r1.6" ];then + branch="r1.6" + patch_file_name="r1.6" + commitid="6496c699bd404076b12a6edcc40889dafaeb5285" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/resnet" + elif [ "$branch_args" == "r1.7" ];then + branch="master" + patch_file_name="r1.7" + commitid="3406fdabaee92f1b22ce0703fa25befa3c40d18e" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/resnet" + elif [ "$branch_args" == "r1.8" ];then + branch="master" + patch_file_name="r1.8" + commitid="b68b6bfa919465567d89bc7fdcf6d0e63967d5aa" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/resnet" + elif [ "$branch_args" == "r1.9" ];then + branch="r1.9" + patch_file_name="r1.9" + commitid="5318681496ef9a37d337737325ad1b238ef75917" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/resnet" + elif [ 
"$branch_args" == "r1.10" ];then + branch="r1.10" + patch_file_name="r1.10" + commitid="8f7331e6a846e7c306dc8ac30313d9f07cf6ee98" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/resnet" + elif [ "$branch_args" == "r2.0" ];then + branch="r2.0" + patch_file_name="r2.0" + commitid="f211f336e8bee3cf531bcad5f611f408069c6f9f" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/ResNet" + elif [ "$branch_args" == "r2.1" ];then + branch="r2.1" + patch_file_name="r2.1" + commitid="44f2dc18e9bd52c6bcadd18f6567817ad798f641" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/ResNet" + elif [ "$branch_args" == "r2.2" ];then + branch="master" + patch_file_name="r2.2" + commitid="bb9ab4fdfb2fc205ffeb4dd671be77312908ef88" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/ResNet" + elif [ "$branch_args" == "r2.3" ];then + branch="master" + patch_file_name="r2.3" + commitid="c63fb183e748427c2d59d96e5a79f9543f56844d" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/ResNet" + else + echo "bad parameters : $1" + return $ret_error + fi + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! 
-d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.10.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.10.patch new file mode 100644 index 0000000000000000000000000000000000000000..731f70acf08137ce81b7427e8c71cb19a4213a06 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.10.patch @@ -0,0 +1,85 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2022-12-16 14:40:00.810000000 +0800 ++++ code/eval.py 2022-12-16 14:40:00.850000000 +0800 +@@ -82,6 +82,15 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-12-16 14:40:00.830000000 +0800 ++++ code/train.py 2022-12-16 14:40:00.870000000 +0800 +@@ -17,7 +17,7 @@ + import glob + import os + import numpy as np +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -35,7 +35,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_rank_id, get_device_num + from src.resnet import conv_variance_scaling_initializer +- ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + ms.set_seed(1) + + +@@ -347,7 +352,7 @@ + amp_level="O3") + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") +- ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -366,8 +371,30 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ 
throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.5.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.5.patch new file mode 100644 index 0000000000000000000000000000000000000000..9b88725c23e0fd98cff1921f8cee51299570d553 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.5.patch @@ -0,0 +1,87 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2021-12-10 14:54:43.810000000 +0800 ++++ code/eval.py 2021-12-10 14:54:43.820000000 +0800 +@@ -84,6 +84,15 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2021-12-10 14:54:43.820000000 +0800 ++++ code/train.py 2021-12-10 14:54:43.820000000 +0800 +@@ -17,6 +17,7 @@ + import glob + import os + import numpy as np ++import time + + from mindspore import context + from mindspore import Tensor +@@ -43,7 +44,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_rank_id, get_device_num + from src.resnet import conv_variance_scaling_initializer +- ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + set_seed(1) + +@@ -358,6 +364,7 @@ + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -376,9 +383,34 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() ++ + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) + ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ 
rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) ++ + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.6.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.6.patch new file mode 100644 index 0000000000000000000000000000000000000000..10bcb6efb0b52e2c34badebe226b381a20205058 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.6.patch @@ -0,0 +1,88 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2021-12-10 11:02:30.430000000 +0800 ++++ code/eval.py 2021-12-10 11:02:30.440000000 +0800 +@@ -84,6 +84,16 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) ++ + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2021-12-10 11:02:30.430000000 +0800 ++++ code/train.py 2021-12-10 11:02:30.440000000 +0800 +@@ -17,6 +17,7 @@ + import glob + import os + import numpy as np ++import time + + from mindspore import context + from mindspore import Tensor +@@ -43,7 +44,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_rank_id, get_device_num + from src.resnet import conv_variance_scaling_initializer +- ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + set_seed(1) + +@@ -358,6 +364,7 @@ + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -376,9 +383,34 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() ++ + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) + ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ 
rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) ++ + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.7.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.7.patch new file mode 100644 index 0000000000000000000000000000000000000000..408f74f28514d26b89ced7f24a6313e23b77b8c3 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.7.patch @@ -0,0 +1,83 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2022-05-30 13:48:49.358918532 +0800 ++++ code/eval.py 2022-05-30 13:48:49.374918725 +0800 +@@ -82,6 +82,15 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-05-30 13:48:49.362918580 +0800 ++++ code/train.py 2022-05-30 13:48:49.374918725 +0800 +@@ -17,6 +17,7 @@ + import glob + import os + import numpy as np ++import time + + import mindspore as ms + import mindspore.nn as nn +@@ -35,6 +36,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_rank_id, get_device_num + from src.resnet import conv_variance_scaling_initializer ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + ms.set_seed(1) + +@@ -351,6 +358,7 @@ + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -369,9 +377,31 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) + ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = 
int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.8.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.8.patch new file mode 100644 index 0000000000000000000000000000000000000000..b4917a88a801dde3d1b60318d9092e99a8a39d0e --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.8.patch @@ -0,0 +1,83 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2022-07-07 09:59:36.316000000 +0800 ++++ code/eval.py 2022-07-07 09:59:36.336000000 +0800 +@@ -82,6 +82,15 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-07-07 09:59:36.316000000 +0800 ++++ code/train.py 2022-07-07 09:59:36.336000000 +0800 +@@ -17,6 +17,7 @@ + import glob + import os + import numpy as np ++import time + + import mindspore as ms + import mindspore.nn as nn +@@ -35,6 +36,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_rank_id, get_device_num + from src.resnet import conv_variance_scaling_initializer ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + ms.set_seed(1) + +@@ -350,7 +357,7 @@ + amp_level="O2", keep_batchnorm_fp32=False) + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") +- ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -369,8 +376,30 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = 
all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.9.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.9.patch new file mode 100644 index 0000000000000000000000000000000000000000..1710c009d94160dabf48efa85f0c67ce4e0f4ac6 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r1.9.patch @@ -0,0 +1,82 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2022-10-17 20:39:19.284000000 +0800 ++++ code/eval.py 2022-10-17 20:39:19.292000000 +0800 +@@ -82,6 +82,15 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-10-17 20:39:19.284000000 +0800 ++++ code/train.py 2022-10-17 20:39:19.292000000 +0800 +@@ -17,6 +17,7 @@ + import glob + import os + import numpy as np ++import time + + import mindspore as ms + import mindspore.nn as nn +@@ -35,6 +36,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_rank_id, get_device_num + from src.resnet import conv_variance_scaling_initializer ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + ms.set_seed(1) + +@@ -348,6 +355,7 @@ + config.run_eval = False + logger.warning("Thor optimizer not support evaluation while training.") + ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size - config.pretrain_epoch_size) + # define callbacks + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossCallBack(config.has_trained_epoch) +@@ -366,8 +374,30 @@ + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.pretrain_epoch_size = config.has_trained_epoch ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) ++ all_data_sum = step_size * config.batch_size * config.epoch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = 
int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if config.run_eval and config.enable_cache: + print("Remember to shut down the cache server via \"cache_admin --stop\"") diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.0.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.0.patch new file mode 100644 index 0000000000000000000000000000000000000000..9fd65abeb800bc77ecc53c9b2b7e1e8361cac542 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.0.patch @@ -0,0 +1,91 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2023-03-27 15:53:34.360000000 +0800 ++++ code/eval.py 2023-03-27 15:53:34.388000000 +0800 +@@ -82,6 +82,15 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2023-03-27 15:53:34.364000000 +0800 ++++ code/train.py 2023-03-27 15:53:34.388000000 +0800 +@@ -14,7 +14,7 @@ + # ============================================================================ + """train resnet.""" + import os +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -31,6 +31,12 @@ + from src.model_utils.config import config + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_num ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + ms.set_seed(1) + +@@ -197,6 +203,7 @@ + ms.load_param_into_net(opt, resume_param) + config.logger.info('resume train from epoch: %s', config.start_epoch) + ++ + # define callbacks + loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=10) + resume_cb = ResumeCallback(config.start_epoch) +@@ -217,15 +224,36 @@ + cache_session_id=config.cache_session_id) + eval_cb = eval_callback(model, config, eval_dataset) + cb.append(eval_cb) +- ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size) + # train model + if config.net_name == "se-resnet50": + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.logger.save_args(config) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size - config.start_epoch, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) +- ++ all_data_sum = step_size * config.batch_size * config.epoch_size 
++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + config.logger.info("If run eval and enable_cache Remember to shut down the cache server via \"cache_admin --stop\"") + + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.1.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.1.patch new file mode 100644 index 0000000000000000000000000000000000000000..9fd65abeb800bc77ecc53c9b2b7e1e8361cac542 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.1.patch @@ -0,0 +1,91 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2023-03-27 15:53:34.360000000 +0800 ++++ code/eval.py 2023-03-27 15:53:34.388000000 +0800 +@@ -82,6 +82,15 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2023-03-27 15:53:34.364000000 +0800 ++++ code/train.py 2023-03-27 15:53:34.388000000 +0800 +@@ -14,7 +14,7 @@ + # ============================================================================ + """train resnet.""" + import os +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -31,6 +31,12 @@ + from src.model_utils.config import config + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_num ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + ms.set_seed(1) + +@@ -197,6 +203,7 @@ + ms.load_param_into_net(opt, resume_param) + config.logger.info('resume train from epoch: %s', config.start_epoch) + ++ + # define callbacks + loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=10) + resume_cb = ResumeCallback(config.start_epoch) +@@ -217,15 +224,36 @@ + cache_session_id=config.cache_session_id) + eval_cb = eval_callback(model, config, eval_dataset) + cb.append(eval_cb) +- ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size) + # train model + if config.net_name == "se-resnet50": + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.logger.save_args(config) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size - config.start_epoch, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) +- ++ all_data_sum = step_size * config.batch_size * config.epoch_size 
++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + config.logger.info("If run eval and enable_cache Remember to shut down the cache server via \"cache_admin --stop\"") + + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.2.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.2.patch new file mode 100644 index 0000000000000000000000000000000000000000..9fd65abeb800bc77ecc53c9b2b7e1e8361cac542 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.2.patch @@ -0,0 +1,91 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2023-03-27 15:53:34.360000000 +0800 ++++ code/eval.py 2023-03-27 15:53:34.388000000 +0800 +@@ -82,6 +82,15 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2023-03-27 15:53:34.364000000 +0800 ++++ code/train.py 2023-03-27 15:53:34.388000000 +0800 +@@ -14,7 +14,7 @@ + # ============================================================================ + """train resnet.""" + import os +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -31,6 +31,12 @@ + from src.model_utils.config import config + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_num ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + ms.set_seed(1) + +@@ -197,6 +203,7 @@ + ms.load_param_into_net(opt, resume_param) + config.logger.info('resume train from epoch: %s', config.start_epoch) + ++ + # define callbacks + loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=10) + resume_cb = ResumeCallback(config.start_epoch) +@@ -217,15 +224,36 @@ + cache_session_id=config.cache_session_id) + eval_cb = eval_callback(model, config, eval_dataset) + cb.append(eval_cb) +- ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size) + # train model + if config.net_name == "se-resnet50": + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.logger.save_args(config) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size - config.start_epoch, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) +- ++ all_data_sum = step_size * config.batch_size * config.epoch_size 
++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + config.logger.info("If run eval and enable_cache Remember to shut down the cache server via \"cache_admin --stop\"") + + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.3.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.3.patch new file mode 100644 index 0000000000000000000000000000000000000000..9fd65abeb800bc77ecc53c9b2b7e1e8361cac542 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/r2.3.patch @@ -0,0 +1,91 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2023-03-27 15:53:34.360000000 +0800 ++++ code/eval.py 2023-03-27 15:53:34.388000000 +0800 +@@ -82,6 +82,15 @@ + # eval model + res = model.eval(dataset) + print("result:", res, "ckpt=", config.checkpoint_file_path) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res['top_1_accuracy'])) + + if __name__ == '__main__': + eval_net() +diff -Nur origin/train.py code/train.py +--- origin/train.py 2023-03-27 15:53:34.364000000 +0800 ++++ code/train.py 2023-03-27 15:53:34.388000000 +0800 +@@ -14,7 +14,7 @@ + # ============================================================================ + """train resnet.""" + import os +- ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore.train.train_thor import ConvertModelUtils +@@ -31,6 +31,12 @@ + from src.model_utils.config import config + from src.model_utils.moxing_adapter import moxing_wrapper + from src.model_utils.device_adapter import get_device_num ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + ms.set_seed(1) + +@@ -197,6 +203,7 @@ + ms.load_param_into_net(opt, resume_param) + config.logger.info('resume train from epoch: %s', config.start_epoch) + ++ + # define callbacks + loss_cb = LossCallBack(config.epoch_size, config.logger, lr, per_print_time=10) + resume_cb = ResumeCallback(config.start_epoch) +@@ -217,15 +224,36 @@ + cache_session_id=config.cache_session_id) + eval_cb = eval_callback(model, config, eval_dataset) + cb.append(eval_cb) +- ++ model.build(dataset, None, sink_size=dataset.get_dataset_size(), epoch=config.epoch_size) + # train model + if config.net_name == "se-resnet50": + config.epoch_size = config.train_epoch_size + dataset_sink_mode = (not config.parameter_server) and target != "CPU" + config.logger.save_args(config) ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size - config.start_epoch, dataset, callbacks=cb, + sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode) +- ++ all_data_sum = step_size * config.batch_size * config.epoch_size 
++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + config.logger.info("If run eval and enable_cache Remember to shut down the cache server via \"cache_admin --stop\"") + + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . 
$CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..1bb080dd86d25ef143a536917b5234bc068b9ca4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/cluster_offline_run.sh @@ -0,0 +1,81 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. $CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + : "${EVAL_DATA_PATH?EVAL_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p 
${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + + [ -d $BASE_PATH/result ] && cp ${RESULT_PATH}/* -rf $BASE_PATH/result/ + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/modelarts_run.sh 
b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/modelarts_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2c0ca527c3046897a8a499e16523e7003105dc37 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/modelarts_run.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh + +init() +{ + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + logger_Info "init called" +} + +run_train() +{ + logger_Info "run_train called" + [ ! -f $CODE_PATH/code/ma-pre-start.sh ] && touch $CODE_PATH/code/ma-pre-start.sh + sed -i '/SINGLESERVER_MODE=/d' $CODE_PATH/code/ma-pre-start.sh + + [[ $MODELARTS_VERSION ]]&&[[ $MODELARTS_VERSION == "V2" ]] && modelarts_version="V2" || modelarts_version="V1" + + + if [ "$SINGLESERVER_MODE" == "True" ];then + echo "now set singleserver_mode OK" + echo -e "\nexport SINGLESERVER_MODE=True" >> $CODE_PATH/code/ma-pre-start.sh + + ${PYTHON_COMMAND} -u ${CODE_PATH}/common/train_modelarts.py --local_code_path $CODE_PATH/code --single_server_mode --modelarts_version $modelarts_version || { logger_Warn "run train modelarts failed ret:$?";return 1; } + else + echo "now not set singleserver_mode" + ${PYTHON_COMMAND} -u ${CODE_PATH}/common/train_modelarts.py --local_code_path $CODE_PATH/code --modelarts_version $modelarts_version || { logger_Warn "run train modelarts failed ret:$?";return 1; } + fi + ${PYTHON_COMMAND} $CODE_PATH/ais_utils.py "training" "result" "OK" +} + +run_eval() +{ + logger_Info "run_eval called" +} + +get_result() +{ + logger_Info "get_result called" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/run_node.sh new file mode 100644 index 
0000000000000000000000000000000000000000..9cd1b0f4f71736e5df5623ff57477d9932944698 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_resnet/scripts/run_node.sh @@ -0,0 +1,110 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. $WORK_PATH/common/node_common.sh + +# ��ȡѵ������ +function get_train_cmd() +{ + [[ $RANK_SIZE -gt 1 ]] && DISTRUTE_ENABLE="True" || DISTRUTE_ENABLE="False" + # 基准代码r2.0.0版本中训练配置文件resnet50_imagenet2012_Boost_config.yaml中,将训练参数output_path改为output_dir + CONFIG_PATH=$WORK_PATH/code/config/resnet50_imagenet2012_Boost_config.yaml + isexisted=`cat $CONFIG_PATH |grep "output_dir" |grep -v grep |awk -F= 'NR==1{print $NF}'` + if [ ! -n "$isexisted" ]; then + OUTPUT_PARA_NAME="output_path" + else + OUTPUT_PARA_NAME="output_dir" + fi + + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py \ + --run_distribute=$DISTRUTE_ENABLE \ + --data_path=${TRAIN_DATA_PATH} \ + --device_num=${DEVICE_NUM} \ + --epoch_size=${EPOCH_SIZE} \ + --$OUTPUT_PARA_NAME="$RUN_PATH" \ + --save_checkpoint=True \ + --save_checkpoint_epochs=${EPOCH_SIZE} \ + --config_path=$CONFIG_PATH + " + # # for mindspore1.5 + # export ENV_FUSION_CLEAR=1 + # export ENV_SINGLE_EVAL=1 + # export SKT_ENABLE=1 + chipname=`npu-smi info -t board -i 0 -c 0 | grep 'Chip Name' | awk {'print $4'}` + export MS_DISABLE_REF_MODE=1 + export MS_ENABLE_FORMAT_MODE=0 + export MS_ASCEND_CHECK_OVERFLOW_MODE="SATURATION_MODE" +} + +function get_eval_cmd() +{ + chipname=`npu-smi info -t board -i 0 -c 0 | grep 'Chip Name' | awk {'print $4'}` + eval_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/eval.py \ + --data_path=${EVAL_DATA_PATH} \ + --config_path=$WORK_PATH/code/config/resnet50_imagenet2012_Boost_config.yaml \ + --checkpoint_file_path=${CHECKPOINT_PATH}" + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH + source $WORK_PATH/config/mindspore_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; 
export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + + check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return 1; } + logger_Debug "mindspore running successfully" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" + + check_path_valid "${EVAL_DATA_PATH}" || { logger_Warn "EVAL_DATA_PATH:${EVAL_DATA_PATH} not valid path" ; return 1; } + logger_Debug "EVAL_DATA_PATH is valid" +} + +function node_train() +{ + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + CHECKPOINT_PATH=`find ${WORK_PATH}/train_parallel$RANK_ID/ -name "*.ckpt" | xargs ls -t | awk 'NR==1{print}'` + [ -f $CHECKPOINT_PATH ] || { logger_Warn "CHECKPOINT_PATH:${CHECKPOINT_PATH} not valid path" ; return 1; } + cp $CHECKPOINT_PATH $RESULT_PATH/ + RUN_PATH=$WORK_PATH/train_parallel$RANK_ID + cd $RUN_PATH + get_eval_cmd + echo "start eval RUN_PATH:${RUN_PATH} SERVER_ID:$SERVER_ID rank $RANK_ID device $DEVICE_ID begin cmd:${eval_run_cmd}" + $eval_run_cmd || { echo "run eval node error ret:$?"; return 1; } + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument 
'${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..abe3be71802a849f3c732984e547f4468e7435e5 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/build.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + + mkdir -p ${CURDIR}/output/config + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/config.sh -r ${CURDIR}//output/config/ + cp ${CURDIR}/config/modelarts_config.py -r ${CURDIR}//output/config/ + [ "$1" == "r1.3" ] && { cp ${CURDIR}/config/modelarts_config.py.r1.3 -r ${CURDIR}//output/config/modelarts_config.py; } + [ -d ${CURDIR}/doc ] && cp ${CURDIR}/doc -r ${CURDIR}/output/ + + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..6520d4fc8bea8d35a129ce2753ad3cec01338904 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/config/config.sh @@ -0,0 +1,20 @@ +export PYTHON_COMMAND=python3.7 +export TRAIN_DATA_PATH=/home/datasets/coco/ +export EVAL_DATA_PATH=/home/datasets/coco/ +export CONFIG_PATH=./config/ssd_vgg16_config.yaml + +#export PRETRAIN_MODEL_PATH=/home/models/ms_bert_large.ckpt + +export LR=0.05 +export EPOCH_SIZE=500 +export TRAIN_STEPS=12000 + +# 8p +export RANK_SIZE=8 +export DEVICE_NUM=8 + +# options needed only if rank_size > 1 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json + +# needed only in cluster mode +# export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..76749c19bf233fdcb805e7d8dcdd5e5a39032bb7 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r1.5"; } + + if [ "$branch_args" == "r1.9" ];then + branch="master" + patch_file_name="r1.9" + commitid="f4eed0f958b40992ee96dd6cfefd76ae989c872f" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/cv/ssd" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/r1.9.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/r1.9.patch new file mode 100644 index 0000000000000000000000000000000000000000..a9d32e60065ad7278786e47f2e82a0490f569c6c --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/r1.9.patch @@ -0,0 +1,76 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2022-09-26 09:51:49.660000000 +0800 ++++ code/eval.py 2022-09-26 09:51:49.668000000 +0800 +@@ -58,6 +58,16 @@ + mAP = apply_eval(eval_param_dict) + print("\n========================================\n") + print(f"mAP: {mAP}") ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ print("ACC_DIR:", ACC_DIR) ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(mAP)) + + @moxing_wrapper() + def eval_net(): +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-09-26 09:51:49.664000000 +0800 ++++ code/train.py 2022-09-26 09:51:49.668000000 +0800 +@@ -16,6 +16,7 @@ + """Train SSD and get checkpoint files.""" + + import os ++import time + import mindspore as ms + import mindspore.nn as nn + from mindspore import Tensor +@@ -36,6 +37,12 @@ + from src.model_utils.moxing_adapter import moxing_wrapper + + set_seed(1) ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + def ssd_model_build(): + if config.model_name == "ssd300": +@@ -198,7 +205,31 @@ + print("In sink mode, one epoch return a loss.") + dataset_sink_mode = True + print("Start train SSD, the first epoch will be slower because of the graph compilation.") ++ model.build(dataset, sink_size=dataset.get_dataset_size(), 
epoch=config.epoch_size) ++ ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(config.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode) ++ all_data_sum = config.epoch_size * dataset.get_dataset_size() * config.batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum / (end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + if __name__ == '__main__': + train_net() diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . 
$CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..d72557eb7ae6ba01fa1ce4fba4f7bbe69989c190 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/scripts/cluster_offline_run.sh @@ -0,0 +1,80 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + : "${EVAL_DATA_PATH?EVAL_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + 
export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + + [ -d $BASE_PATH/result ] && cp ${RESULT_PATH}/* -rf $BASE_PATH/result/ + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..d4f38149ff2157035d4a32f1db5831e655bfb24c --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_ssd/scripts/run_node.sh @@ -0,0 +1,107 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +function get_train_cmd() +{ + [[ $RANK_SIZE -gt 1 ]] && DISTRUTE_ENABLE="true" || DISTRUTE_ENABLE="false" + + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/run_pretrain.py \ + --distribute=$DISTRUTE_ENABLE \ + --epoch_size=$EPOCH_SIZE \ + --enable_save_ckpt=true \ + --enable_lossscale=true \ + --do_shuffle=true \ + --enable_data_sink=true \ + --data_sink_steps=100 \ + --accumulation_steps=1 \ + --save_checkpoint_path=$RUN_PATH \ + --save_checkpoint_steps=$TRAIN_STEPS \ + --save_checkpoint_num=1 \ + --load_checkpoint_path=$PRETRAIN_MODEL_PATH \ + --data_dir=${TRAIN_DATA_PATH} \ + --device_id=${DEVICE_ID} \ + --device_num=${DEVICE_NUM} \ + --train_steps=${TRAIN_STEPS} \ + --config_path=$WORK_PATH/code/pretrain_config_Ascend_Boost.yaml + " + return 0 +} + +function get_eval_cmd() +{ + CONFIG_FILE=$WORK_PATH/code/pretrain_config_Ascend_Boost.yaml + sed -i "s|eval_data_dir:.*|eval_data_dir: '$EVAL_DATA_PATH'|g" "$CONFIG_FILE" + sed -i "s|schema_file:.*|schema_file: null|g" "$CONFIG_FILE" + sed -i "s|eval_ckpt:.*|eval_ckpt: '$CHECKPOINT_PATH'|g" "$CONFIG_FILE" + eval_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/pretrain_eval.py --config_path=$CONFIG_FILE" + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH + source $WORK_PATH/config/mindspore_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + + check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return 1; } + logger_Debug "mindspore running successfully" + + check_file_valid ${PRETRAIN_MODEL_PATH} || { logger_Warn 
"PRETRAIN_MODEL_PATH:${PRETRAIN_MODEL_PATH} not valid" ; return 1; } + logger_Debug "PRETRAIN_MODEL_PATH path valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" + + check_path_valid "${EVAL_DATA_PATH}" || { logger_Warn "EVAL_DATA_PATH:${EVAL_DATA_PATH} not valid path" ; return 1; } + logger_Debug "EVAL_DATA_PATH is valid" +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "false" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + CHECKPOINT_PATH=`find ${WORK_PATH}/train_parallel$RANK_ID/ -name "*.ckpt" | xargs ls -t | awk 'NR==1{print}'` + [ -f $CHECKPOINT_PATH ] || { logger_Warn "CHECKPOINT_PATH:${CHECKPOINT_PATH} not valid path" ; return 1; } + cp $CHECKPOINT_PATH $RESULT_PATH/ + RUN_PATH=$WORK_PATH/train_parallel$RANK_ID + cd $RUN_PATH + get_eval_cmd + echo "start eval RUN_PATH:${RUN_PATH} SERVER_ID:$SERVER_ID rank $RANK_ID device $DEVICE_ID begin cmd:${eval_run_cmd}" + $eval_run_cmd || { echo "run eval node error ret:$?"; return 1; } + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..065998ce7d27c07c80771ff446961caaac88439b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..7ec5482521c054694583acd25d623ef948e3264e --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/config/config.sh @@ -0,0 +1,12 @@ +#!/bin/bash +export PYTHON_COMMAND=python3.7 +export TRAIN_DATA_FILE=/home/datasets/criteo/mini_demo.txt + +export RANK_SIZE=8 +export DEVICE_NUM=8 + +# need if rank_size > 1 +export RANK_TABLE_FILE=/home/lcm/tool/rank_table_16p_62_64.json +# cluster need for node info +#export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json + diff --git "a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/doc/ais-bench+mindspore-widedeep\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/doc/ais-bench+mindspore-widedeep\344\275\277\347\224\250\350\257\264\346\230\216.md" new file mode 100644 index 0000000000000000000000000000000000000000..aa00154d68fd5e9e707e4fff9211cdb75ac4ec42 --- /dev/null +++ "b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/doc/ais-bench+mindspore-widedeep\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -0,0 +1,63 @@ +# Ais-Bench+Mindspore+widedeep使用说明 + +## 简介 + +AI Server Benchmark 是按《信息技术 人工智能 服务器系统性能测试规范》对人工智能服务器系统的性能进行性能评估的测试系统(测试套件),简称Ais-Bench软件。 + +## 使用前提 + +本程序包运行需要基于以下前提 + +1. Atlas 800-9000设备 +2. 安装好CANN包和Mindspore对应版本。并可以运行正常mindspore测试程序。 +3. 
保存数据集和相关预处理文件等到设备中。 + +## 集群节点配置 + +如果运行设备大于1个设备,那么需要运行设置ssh节点文件。说明节点信息 +{ +"cluster": { +"xx.xx.xx.xx": { # 节点ip 必须与ranktable中对应 +"user": "xxxx", # 用户名 免密可以不用设置 +"pd": "xx", # 密码 免密不用设置 +"port": xx # 端口 默认22 可以不用设置 +}, +"xx.xx.xx.xx": { +"user": "xxxx", +"pd": "xx", +"port": xx +} +} +} + +## 集群节点免密设置 + +设置密钥认证的参考操作如下: +ssh-keygen -t rsa -b 2048 # 登录管理节点并生成SSH Key。安全起见,建议用户到"Enter passphrase"步骤时输入密钥密码,且符合密码复杂度要求。建议执行这条命令前先将umask设置为0077,执行完后再恢复原来umask值。 +ssh-copy-id -i ~/.ssh/id_rsa.pub ``@`` # 将管理节点的公钥拷贝到所有节点的机器上,``@``替换成要拷贝到的对应节点的账户和ip。 + +## 配置文件信息 + +> #python版本设置 +> export PYTHON_COMMAND=python3.7 +> #训练数据文件路径 +> export TRAIN_DATA_FILE=/home/data/criteo/origin_data/mini_demo.txt + +> 节点信息 +> export RANK_SIZE=8 +> export DEVICE_NUM=8 + +> #need if rank_size > 1 +> export RANK_TABLE_FILE=/home/lcm/tool/rank_table_16p_62_64.json + +> #cluster need for node info +> export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json + +说明: +配置文件默认是8卡训练。 +单卡训练时,需要设置RANK_SIZE=1,DEVICE_NUM=1,且不能使用RANK_TABLE_FILE环境变量. +同时还请按需增加指定执行卡序号变量声明export SINGLE_CARD_INDEX。默认 SINGLE_CARD_INDEX=0,可以不显式声明。其它卡时需要显式声明,比如export SINGLE_CARD_INDEX=6 +## FAQ + +如果程序运行遇到错误--OSError: /lib/aarch64-linux-gnu/libgomp.so.1: cannot allocate memory in static TLS block,请执行以下命令可解决问题: +export LD_PRELOAD=$LD_PRELOAD:/usr/local/python3.7.5/lib/python3.7/site-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0 diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..88da82a8af6e340a8447fa4aa494bfcd1a4e13e1 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/patch.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="r1.5"; } + + if [ "$branch_args" == "r1.5" ];then + branch="master" + patch_file_name="r1.5" + commitid="5a4ff4e3dc9bcb46dbb71b6b16fbadbb68c5e8dc" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/recommend/wide_and_deep" + elif [ "$branch_args" == "r1.9" ];then + branch="master" + patch_file_name="r1.9" + commitid="948fe927641651e7a36103b96e96d8e86b4a5255" + git_url="https://gitee.com/mindspore/models.git" + modelzoo_sub_dir="models/official/recommend/wide_and_deep" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/r1.5.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/r1.5.patch new file mode 100644 index 0000000000000000000000000000000000000000..f8e1e8572c2d1db864191a41697c247b12164f75 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/r1.5.patch @@ -0,0 +1,73 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2021-12-10 16:23:44.520000000 +0800 ++++ code/eval.py 2021-12-10 16:23:44.530000000 +0800 +@@ -112,7 +112,14 @@ + + eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) + +- model.eval(ds_eval, callbacks=eval_callback) ++ res = model.eval(ds_eval, callbacks=eval_callback) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ print("ACC_DIR:", ACC_DIR) ++ if not os.path.exists(ACC_DIR): ++ os.mkdir(ACC_DIR) ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(res["auc"])) + + + def modelarts_pre_process(): +diff -Nur origin/train.py code/train.py +--- origin/train.py 2021-12-10 16:23:44.520000000 +0800 ++++ code/train.py 2021-12-10 16:23:44.530000000 +0800 +@@ -13,6 +13,7 @@ + # limitations under the License. 
+ """ test_training """ + import os ++import time + from mindspore import Model, context + from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor + from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel +@@ -20,7 +21,12 @@ + from src.datasets import create_dataset, DataType + from src.model_utils.config import config + from src.model_utils.moxing_adapter import moxing_wrapper +- ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + def get_WideDeep_net(configure): + """ +@@ -83,7 +89,28 @@ + ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), + keep_checkpoint_max=5) + ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=configure.ckpt_path, config=ckptconfig) ++ model.build(ds_train, None, sink_size=ds_train.get_dataset_size(), epoch=epochs) ++ ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(epochs, ds_train, callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb]) ++ all_data_sum = ds_train.get_dataset_size() * batch_size * epochs ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum/(end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if not os.path.exists(THROUGHPUT_DIR): ++ os.mkdir(THROUGHPUT_DIR) ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + + def modelarts_pre_process(): diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/r1.9.patch b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/r1.9.patch 
new file mode 100644 index 0000000000000000000000000000000000000000..a1a11252925fba522a9b9bd445dc7e0d6df243da --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/r1.9.patch @@ -0,0 +1,78 @@ +diff -Nur origin/eval.py code/eval.py +--- origin/eval.py 2022-09-08 14:41:47.140000000 +0800 ++++ code/eval.py 2022-09-08 14:41:47.156000000 +0800 +@@ -112,7 +112,17 @@ + + eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) + +- model.eval(ds_eval, callbacks=eval_callback) ++ out = model.eval(ds_eval, callbacks=eval_callback) ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ print("ACC_DIR:", ACC_DIR) ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(out['auc'])) + + + def modelarts_pre_process(): +diff -Nur origin/train.py code/train.py +--- origin/train.py 2022-09-08 14:41:47.144000000 +0800 ++++ code/train.py 2022-09-08 14:41:47.156000000 +0800 +@@ -13,6 +13,7 @@ + # limitations under the License. 
+ """ test_training """ + import os ++import time + from mindspore import Model, context + from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor + from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel +@@ -20,6 +21,12 @@ + from src.datasets import create_dataset, DataType + from src.model_utils.config import config + from src.model_utils.moxing_adapter import moxing_wrapper ++try: ++ import ais_utils ++except ImportError: ++ ais_utils_is_existed = False ++else: ++ ais_utils_is_existed = True + + + def get_WideDeep_net(configure): +@@ -83,7 +90,31 @@ + ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), + keep_checkpoint_max=5) + ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=configure.ckpt_path, config=ckptconfig) ++ model.build(ds_train, None, sink_size=ds_train.get_dataset_size(), epoch=epochs) ++ ++ if ais_utils_is_existed: ++ start_time = ais_utils.get_datatime() ++ else: ++ start_time = time.time() + model.train(epochs, ds_train, callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb]) ++ all_data_sum = epochs * ds_train.get_dataset_size() * batch_size ++ if ais_utils_is_existed: ++ end_time = ais_utils.get_datatime() ++ throughput_rate = ais_utils.calc_throughput_rate(all_data_sum, start_time, end_time) ++ else: ++ end_time = time.time() ++ throughput_rate = all_data_sum / (end_time - start_time) ++ ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) + + + def modelarts_pre_process(): diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2bdca78d3eaa658e65fe15ee31a11fab400c2e16 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/scripts/cluster_offline_run.sh @@ -0,0 +1,78 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. $CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_FILE?TRAIN_DATA_FILE not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train 
start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..6d2e209e3108a106a5d7955ec41a9adb44c589bc --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_mindspore_widedeep/scripts/run_node.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +function get_data_preprocess_cmd(){ + LINE_COUNT=`cat ${TRAIN_DATA_FILE} | wc -l` + data_preprocess_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/src/preprocess_data.py \ + --data_path=$WORK_PATH/data/ \ + --dense_dim=13 --slot_dim=26 --threshold=100 --train_line_count=$LINE_COUNT --skip_id_convert=0 + " + return 0 +} + +function get_train_cmd() +{ + [[ $RANK_SIZE -gt 1 ]] && DISTRUTE_ENABLE="True" || DISTRUTE_ENABLE="False" + + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py --data_path=$DATASET_PATH --dataset_type=mindrecord" + return 0 +} + +function get_eval_cmd() +{ + DATASET_PATH=$WORK_PATH/data/mindrecord + eval_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/eval.py \ + --data_path=$DATASET_PATH --dataset_type=mindrecord --ckpt_path=$CHECKPOINT_PATH" + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/mindspore_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # ͨü Ҫ PYTHON_COMMAND RANK_SIZERANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # ǷװӦ + check_mindspore_run_ok ${PYTHON_COMMAND} || { logger_Warn "mindspore running failed" ; return 1; } + logger_Debug "mindspore running successfully" + + check_file_valid "${TRAIN_DATA_FILE}" || { logger_Warn "TRAIN_DATA_FILE:${TRAIN_DATA_FILE} not valid file" ; return 1; } + logger_Debug "TRAIN_DATA_FILE is valid" +} + +function node_data_preprocess() +{ + # data + rm -rf $WORK_PATH/data/* + mkdir -p $WORK_PATH/data/origin_data + ln -sf ${TRAIN_DATA_FILE} $WORK_PATH/data/origin_data/train.txt + + get_data_preprocess_cmd + $data_preprocess_cmd 
|| { logger_Warn "preprocess run failed"; return 1; } + + DATASET_PATH=$WORK_PATH/data/mindrecord + check_path_valid $DATASET_PATH || { logger_Warn "mindpath:${DATASET_PATH} not valid path" ; return 1; } +} + +function node_train() +{ + node_data_preprocess + + # ͨѵӿ + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + CHECKPOINT_PATH=`find ${WORK_PATH}/train_parallel$RANK_ID/ -name "*.ckpt" | xargs ls -t | awk 'NR==1{print}'` + [ -f $CHECKPOINT_PATH ] || { logger_Warn "CHECKPOINT_PATH:${CHECKPOINT_PATH} not valid path" ; return 1; } + RUN_PATH=$WORK_PATH/train_parallel$RANK_ID + cd $RUN_PATH + get_eval_cmd + echo "start eval RUN_PATH:${RUN_PATH} SERVER_ID:$SERVER_ID rank $RANK_ID device $DEVICE_ID begin cmd:${eval_run_cmd}" + $eval_run_cmd || { echo "run eval node error ret:$?"; return 1; } + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..2c928242e78d93cb39bf9c91aa2bd32690b63e2f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/config/config.sh @@ -0,0 +1,12 @@ +export PYTHON_COMMAND=python3.7.5 + +# 参数信息 +export TRAIN_DATA_PATH=/home/datasets/bertData/cn-wiki-128/ +export EVAL_DATA_PATH=/home/datasets/bertData/cn-wiki-128/ + +# 网络信息 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json +export RANK_SIZE=8 +export DEVICE_NUM=8 + +#export NODEINFO_FILE=/home/tools/2node_6264.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea18ddd072526df7e6c9934f94c1a0cf5e891916 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/patch.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow" + + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..0a30145bd2eb66e51377bf49fad63e3a3e830f0e --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + #run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..498ba4ca09b97d88ffc75a3148bd357276dda88a --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/scripts/cluster_offline_run.sh @@ -0,0 +1,79 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + : "${EVAL_DATA_PATH?EVAL_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash -x $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash -x $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source 
$WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..95ad39a92921709e05de3022b06aa188e8c89ebc --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_bert_base/scripts/run_node.sh @@ -0,0 +1,96 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + [[ $RANK_SIZE -gt 1 ]] && DISTRUTE_ENABLE="True" || DISTRUTE_ENABLE="False" + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/src/pretrain/run_pretraining.py \ + --bert_config_file=$WORK_PATH/code/configs/bert_base_config.json \ + --max_seq_length=128 \ + --max_predictions_per_seq=20 \ + --train_batch_size=128 \ + --learning_rate=6.25e-6 \ + --num_warmup_steps=100 \ + --num_train_steps=500 \ + --optimizer_type=adam \ + --manual_fp16=True \ + --use_fp16_cls=True \ + --input_files_dir=${TRAIN_DATA_PATH} \ + --eval_files_dir=$EVAL_DATA_PATH \ + --npu_bert_debug=False \ + --npu_bert_use_tdt=True \ + --do_train=True \ + --num_accumulation_steps=1 \ + --npu_bert_job_start_file= \ + --iterations_per_loop=100 \ + --save_checkpoints_steps=10000 \ + --npu_bert_clip_by_global_norm=False \ + --distributed=$DISTRUTE_ENABLE \ + --npu_bert_loss_scale=0 \ + --output_dir=$RUN_PATH/ + " + return 0 +} + +function get_eval_cmd() +{ + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug 
"TRAIN_DATA_PATH is valid" + +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "false" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..0f62ce606d2b5050947557d635b3c06366dda903 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/config/config.sh @@ -0,0 +1,11 @@ +export PYTHON_COMMAND=python3.7 + +# 参数信息 +export TRAIN_DATA_PATH=/home/datasets/imagenet_TF +export MAX_TRAIN_STEPS=200 +# 网络信息 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json +export RANK_SIZE=8 +export DEVICE_NUM=8 + +#export NODEINFO_FILE=/home/tools/2node_6264.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..a32ac5dd9cca9438bffa6fc010419c2e6b60a65a --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/cv/image_classification/DenseNet121_ID0067_for_TensorFlow" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..87ed5ec229d6ce77dd958512a161df90bb27f982 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/scripts/cluster_offline_run.sh @@ -0,0 +1,79 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${MAX_TRAIN_STEPS?MAX_TRAIN_STEPS not set}" + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source 
$WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..6e9c9de0f87bd3a1c9a3b454b2768ac03c2c0c3b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_densenet121/scripts/run_node.sh @@ -0,0 +1,78 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py \ + --data_dir=${TRAIN_DATA_PATH} \ + --rank_size=$RANK_SIZE \ + --iterations_per_loop=10 \ + --mode=train \ + --max_train_steps=$MAX_TRAIN_STEPS \ + --lr=0.04 \ + --display_every=10 \ + " + return 0 +} + +function get_eval_cmd() +{ + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "false" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn 
"invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..9293b1199d8a065cf1198cb0395d8a35675a7799 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/config/config.sh @@ -0,0 +1,14 @@ +export PYTHON_COMMAND=python3.7.5 + +# 参数信息 +export TRAIN_DATA_PATH=/home/datasets/imagenet_TF/ +export MAX_TRAIN_STEPS=500 +export BATCH_SIZE=256 + + +# 网络信息 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json +export RANK_SIZE=8 +export DEVICE_NUM=8 + +#export NODEINFO_FILE=/home/tools/2node_6264.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..d9e49c508dc206c9e9944f44e2e5cde27a34a5f7 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/cv/image_classification/MobileNetV2_ID0074_for_TensorFlow" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..b9931156db6e8f9b39c3b34b305764ff5e981769 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/scripts/cluster_offline_run.sh @@ -0,0 +1,80 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${MAX_TRAIN_STEPS?MAX_TRAIN_STEPS not set}" + : "${BATCH_SIZE?BATCH_SIZE not set}" + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash -x $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash -x $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start 
--------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..e12a2d3272b7e000aaa25ebb036426245cc95db9 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_mobilenetv2/scripts/run_node.sh @@ -0,0 +1,86 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py \ + --dataset_dir=$TRAIN_DATA_PATH \ + --max_train_steps=$MAX_TRAIN_STEPS \ + --iterations_per_loop=10 \ + --model_name=\"mobilenet_v2\" \ + --moving_average_decay=0.9999 \ + --label_smoothing=0.1 \ + --preprocessing_name=\"inception_v2\" \ + --weight_decay='0.000004' \ + --batch_size=$BATCH_SIZE \ + --learning_rate_decay_type='cosine_annealing' \ + --learning_rate=0.4 \ + --optimizer='momentum' \ + --momentum='0.9' \ + --warmup_epochs=5 \ + " + return 0 +} + +function get_eval_cmd() +{ + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" + +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "false" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train 
failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..410eb5246f74030963be8828811bb49f40a0abb2 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/config/config.sh @@ -0,0 +1,12 @@ +export PYTHON_COMMAND=python3.7.5 + +# 参数信息 +export TRAIN_DATA_PATH=/home/datasets/Bert-small-Dataset/ +export EVAL_DATA_PATH=/home/datasets/Bert-TestData/ + +# 网络信息 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json +export RANK_SIZE=8 +export DEVICE_NUM=8 + +#export NODEINFO_FILE=/home/tools/2node_6264.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/master.patch b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/master.patch new file mode 100644 index 0000000000000000000000000000000000000000..99e7cd5f19345ce262ebb8bbf5faccc496168345 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/master.patch @@ -0,0 +1,28 @@ +diff -Nur origin/src/pretrain/run_pretraining.py code/src/pretrain/run_pretraining.py +--- origin/src/pretrain/run_pretraining.py 2021-12-01 14:52:58.809717535 +0800 ++++ code/src/pretrain/run_pretraining.py 2021-12-01 14:52:58.817717584 +0800 +@@ -237,6 +237,24 @@ + else: + print('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' % + (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr), flush=True) ++ ++ try: ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("ais-bench Warning: 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("ais-bench Warning: 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ if rank_id == 0: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ throughput = sent_per_sec ++ print("ais-bench {} set throughput:{}".format(rank_id, throughput)) ++ f.write("{}".format(throughput)) ++ except Exception as msg: ++ print("write throught failed {}".format(msg)) ++ + self.elapsed_secs = 0. + self.count = 0 + self.avg_loss = 0.0 diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..7a11c1c87568e903cf29f3e6cb505d783cbd75df --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/nlp/Nezha-large_for_TensorFlow" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..0a30145bd2eb66e51377bf49fad63e3a3e830f0e --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + #run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..498ba4ca09b97d88ffc75a3148bd357276dda88a --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/scripts/cluster_offline_run.sh @@ -0,0 +1,79 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + : "${EVAL_DATA_PATH?EVAL_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash -x $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash -x $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source 
$WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..0ef0cb4864fbcf46690bc5adda055c38a0104236 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_nezha_large/scripts/run_node.sh @@ -0,0 +1,99 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + [[ $RANK_SIZE -gt 1 ]] && DISTRUTE_ENABLE="True" || DISTRUTE_ENABLE="False" + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/src/pretrain/run_pretraining.py \ + --bert_config_file=$WORK_PATH/code/configs/nezha_large_config.json \ + --max_seq_length=128 \ + --max_predictions_per_seq=20 \ + --train_batch_size=64 \ + --learning_rate=1e-4 \ + --num_warmup_steps=100 \ + --num_train_steps=1000 \ + --optimizer_type=lamb \ + --manual_fp16=True \ + --use_fp16_cls=True \ + --input_files_dir=${TRAIN_DATA_PATH} \ + --eval_files_dir=$EVAL_DATA_PATH \ + --npu_bert_debug=False \ + --npu_bert_use_tdt=True \ + --do_train=True \ + --num_accumulation_steps=1 \ + --npu_bert_job_start_file= \ + --iterations_per_loop=100 \ + --save_checkpoints_steps=1000 \ + --npu_bert_clip_by_global_norm=False \ + --distributed=$DISTRUTE_ENABLE \ + --npu_bert_loss_scale=0 \ + --output_dir=$RUN_PATH/ + " + return 0 +} + +function get_eval_cmd() +{ + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug 
"TRAIN_DATA_PATH is valid" + + check_path_valid "${EVAL_DATA_PATH}" || { logger_Warn "EVAL_DATA_PATH:${EVAL_DATA_PATH} not valid path" ; return 1; } + logger_Debug "EVAL_DATA_PATH is valid" + +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp 
${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..6b092948fba2901fbe68545a1178d93219e86885 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/config/config.sh @@ -0,0 +1,13 @@ +export PYTHON_COMMAND=python3.7.5 + +# 参数信息 +export TRAIN_DATA_PATH=/home/datasets/imagenet_TF +export RESNET_SIZE=101 +export EPOCH_SIZE=1 +export BATCH_SIZE=90 +# 网络信息 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json +export RANK_SIZE=8 +export DEVICE_NUM=8 + +#export NODEINFO_FILE=/home/tools/2node_6264.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..f65755a69ea6de3346eddcd512b492018f8c847d --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/cv/image_classification/ResNet101_ID0063_for_TensorFlow" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..0a30145bd2eb66e51377bf49fad63e3a3e830f0e --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + #run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..e27eaec81baecc96a9535c7c6385287f85eb25b7 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/scripts/cluster_offline_run.sh @@ -0,0 +1,82 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${EPOCH_SIZE?EPOCH_SIZE not set}" + : "${RESNET_SIZE?RESNET_SIZE not set}" + : "${BATCH_SIZE?BATCH_SIZE not set}" + : "${RANK_SIZE?RANK_SIZE not set}" + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash -x $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash -x $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info 
"-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..71e55c5bbf5d4d2d0306c0b0dee75bdc70cb0370 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet101/scripts/run_node.sh @@ -0,0 +1,84 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/r1/resnet/imagenet_main.py \ + --resnet_size=$RESNET_SIZE \ + --batch_size=$BATCH_SIZE \ + --num_gpus=1 \ + --dtype=fp16 \ + --label_smoothing=0.1 \ + --loss_scale=512 \ + --max_train_steps=2000 \ + --train_epochs=$EPOCH_SIZE \ + --eval_only=False \ + --epochs_between_evals=10 \ + --hooks=ExamplesPerSecondHook,loggingtensorhook,loggingmetrichook \ + --data_dir=${TRAIN_DATA_PATH} \ + --model_dir=$RUN_PATH \ + " + return 0 +} + +function get_eval_cmd() +{ + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "false" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == 
"eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/README.txt b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..443ba899986719231d25a77d3abe363d9442976b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/README.txt @@ -0,0 +1,7 @@ +resnet50模型代码基准来源: + git_url="https://gitee.com/ascend/modelzoo.git" + modelzoo_sub_dir="modelzoo/built-in/TensorFlow/Official/cv/image_classification/Resnet50v1.5_for_TensorFlow/" + + +origin: 原始目录 +code: 修改目录 diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r 
${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..b9673b9745b103588ebaed633004eb4e45070785 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/config/config.sh @@ -0,0 +1,15 @@ +export PYTHON_COMMAND=python3.7.5 + +# 参数信息 +export TRAIN_DATA_PATH=/home/datasets/imagenet_TF +export RESNET_SIZE=50 +export EPOCH_SIZE=2 +export BATCH_SIZE=128 + + +# 网络信息 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.old.json +export RANK_SIZE=8 +export DEVICE_NUM=8 + +#export NODEINFO_FILE=/home/tools/2node_6264.json diff --git "a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/doc/ais-bench+tensorflow-resnet\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/doc/ais-bench+tensorflow-resnet\344\275\277\347\224\250\350\257\264\346\230\216.md" new file mode 100644 index 0000000000000000000000000000000000000000..d9bf811de5fc831a946f0e464bbad845d486d368 --- /dev/null +++ "b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/doc/ais-bench+tensorflow-resnet\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -0,0 +1,71 @@ +# Ais-Bench+Mindspore+Resnet使用说明 + +## 简介 + +AI Server Benchmark 是按《信息技术 人工智能 服务器系统性能测试规范》对人工智能服务器系统的性能进行性能评估的测试系统(测试套件),简称Ais-Bench软件。 + +## 使用前提 + +本程序包运行需要基于以下前提 + +1. Atlas 800-9000设备 +2. 安装好CANN包和Mindspore对应版本。并可以运行正常mindspore测试程序。 +3. 
保存数据集和相关预处理文件等到设备中。 + +## 集群节点配置 + +如果运行设备大于1个设备,那么需要运行设置ssh节点文件。说明节点信息 +{ +"cluster": { +"xx.xx.xx.xx": { # 节点ip 必须与ranktable中对应 +"user": "xxxx", # 用户名 免密可以不用设置 +"pd": "xx", # 密码 免密不用设置 +"port": xx # 端口 默认22 可以不用设置 +}, +"xx.xx.xx.xx": { +"user": "xxxx", +"pd": "xx", +"port": xx +} +} +} + +## 集群节点免密设置 + +* 设置密钥认证的参考操作如下: + ssh-keygen -t rsa -b 2048 # 登录管理节点并生成SSH Key。安全起见,建议用户到"Enter passphrase"步骤时输入密钥密码,且符合密码复杂度要求。建议执行这条命令前先将umask设置为0077,执行完后再恢复原来umask值。 + +* ssh-copy-id -i ~/.ssh/id_rsa.pub ``@`` # 将管理节点的公钥拷贝到所有节点的机器上,``@``替换成要拷贝到的对应节点的账户和ip。 + +* 设置ssh代理管理ssh密钥,避免工具批量安装操作过程中输入密钥密码 + + ssh-agent bash # 开启ssh-agent的bash进程 + + ssh-add # 向ssh-agent添加私钥 + + +## 配置文件信息 + +> #python版本设置 +> export PYTHON_COMMAND=python3.7 +> #训练数据集路径 +> export TRAIN_DATA_PATH=/home/datasets/imagenet/train/ +> #验证数据集路径 +> export EVAL_DATA_PATH=/home/datasets/imagenet/val/ +> #epoch数 +> export EPOCH_SIZE=90 + +> 节点信息 +> export RANK_SIZE=8 +> export DEVICE_NUM=8 + +> #need if rank_size > 1 +> export RANK_TABLE_FILE=/home/lcm/tool/rank_table_16p_62_64.json + +> #cluster need for node info +> export NODEINFO_FILE=/home/lcm/tool/ssh64_66.json + +说明: +配置文件默认是8卡训练。 +单卡训练时,需要设置RANK_SIZE=1,DEVICE_NUM=1,且不能使用RANK_TABLE_FILE环境变量. 
+同时还请按需增加指定执行卡序号变量声明export SINGLE_CARD_INDEX。默认 SINGLE_CARD_INDEX=0,可以不显式声明。其它卡时需要显式声明,比如export SINGLE_CARD_INDEX=6 diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/master.patch b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/master.patch new file mode 100644 index 0000000000000000000000000000000000000000..acda7397643c7f9490d2c7f51cc3d0fac5564c0b --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/master.patch @@ -0,0 +1,48 @@ +diff -Nur origin/official/r1/resnet/imagenet_main.py code/official/r1/resnet/imagenet_main.py +--- origin/official/r1/resnet/imagenet_main.py 2021-12-01 14:49:02.440337230 +0800 ++++ code/official/r1/resnet/imagenet_main.py 2021-12-01 14:49:02.556337859 +0800 +@@ -405,6 +405,21 @@ + flags_obj, imagenet_model_fn, input_function, DATASET_NAME,NUM_IMAGES, + shape=[DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS],) + ++ try: ++ if 'accuracy' in result.get("eval_results",None): ++ ACC_DIR = os.getenv("RESULT_PATH") ++ if ACC_DIR is None: ++ print("ais-bench Warning: 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(ACC_DIR): ++ print("ais-bench Warning: 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ ACC_LOG = os.path.join(ACC_DIR, "eval_acc.log") ++ with open(ACC_LOG, 'w') as f: ++ f.write("{}".format(result["eval_results"]['accuracy'])) ++ print("ais-bench set accuracy:{}".format(result["eval_results"]['accuracy'])) ++ except Exception as msg: ++ print("ais-bench accuracy set failed {}".format(msg)) ++ + return result + def main(_): + ############## npu modify begin ############# +diff -Nur origin/official/utils/logs/hooks.py code/official/utils/logs/hooks.py +--- origin/official/utils/logs/hooks.py 2021-12-01 14:49:02.444337252 +0800 ++++ code/official/utils/logs/hooks.py 2021-12-01 14:49:02.560337881 +0800 +@@ -159,3 +159,19 @@ + "steps: %s,elapsed_steps:%d,batch:%d,FPS:%f,ips:%f,batch_time:%f", int(self._total_steps), + int(elapsed_steps),int(self._batch_size),float(current_examples_per_sec),float(ips), + float(batch_time)) ++ try: ++ rank_id = int(os.getenv('DEVICE_INDEX')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("ais-bench Warning: 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("ais-bench Warning: 'RESULT_PATH' is not a valid directory. 
") ++ else: ++ if rank_id == 0: ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ throughput = current_examples_per_sec ++ print("ais-bench {} set throughput:{}".format(rank_id, throughput)) ++ f.write("{}".format(throughput)) ++ except Exception as msg: ++ print("write throught failed {}".format(msg)) diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..efc23b5e3ad06db40f6823e480536acd1d80bcfb --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. $SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/cv/image_classification/Resnet50v1.5_ID1721_for_TensorFlow" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + 
BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..b1f687a9cf0b39db84cf659070db78195891725f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/scripts/cluster_offline_run.sh @@ -0,0 +1,80 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. $CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${EPOCH_SIZE?EPOCH_SIZE not set}" + : "${RESNET_SIZE?RESNET_SIZE not set}" + : "${BATCH_SIZE?BATCH_SIZE not set}" + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end 
--------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..0750cab1e8b14263fc081cbfa8e315dfae56d8a4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnet50/scripts/run_node.sh @@ -0,0 +1,74 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/official/r1/resnet/imagenet_main.py \ + --resnet_size=$RESNET_SIZE --resnet_version=1 \ + --epochs_between_evals=$EPOCH_SIZE --hooks=ExamplesPerSecondHook \ + --train_epochs=$EPOCH_SIZE \ + --batch_size=$BATCH_SIZE --data_dir=${TRAIN_DATA_PATH} --model_dir=$RUN_PATH/" + return 0 +} + +function get_eval_cmd() +{ + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { 
logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..a379b0017ef79b4998b1bba7e18b4cc7ac6d1f48 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/config/config.sh @@ -0,0 +1,11 @@ +export PYTHON_COMMAND=python3.7 + +# 参数信息 +export TRAIN_DATA_PATH=/home/datasets/imagenet_TF +export MAX_TRAIN_STEPS=1000 +# 网络信息 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json +export RANK_SIZE=8 +export DEVICE_NUM=8 + +#export NODEINFO_FILE=/home/tools/2node_6264.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..8e68d15738b8a9505354ac2a28b1e443122eb03f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/cv/image_classification/ResNext50_ID0070_for_TensorFlow" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..35df41fe3875b6ddf724391db56daeb3f0e85cb9 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/scripts/cluster_offline_run.sh @@ -0,0 +1,79 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${MAX_TRAIN_STEPS?MAX_TRAIN_STEPS not set}" + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash -x $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash -x $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source 
$WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..c0191f9ca1ee1aba7456c4b7cbdeed2720e60661 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_resnext50/scripts/run_node.sh @@ -0,0 +1,87 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + [[ $RANK_SIZE -eq 1 ]] && CONFIG_FILE_32bs_1p_host || CONFIG_FILE=res50_32bs_8p_host + ln -sf $WORK_PATH/code/code/resnext50_train/configs ../ + sed -i "s|'num_epochs':.* |'num_epochs': 3, |g" $WORK_PATH/code/code/resnext50_train/configs/$CONFIG_FILE.py + sed -i "s|'rank_size':.* |'rank_size': $RANK_SIZE, |g" $WORK_PATH/code/code/resnext50_train/configs/$CONFIG_FILE.py + mkdir -p /data/ + ln -sf $TRAIN_DATA_PATH /data/imagenet_TF + + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/code/resnext50_train/mains/res50.py \ + --config_file=$CONFIG_FILE \ + --max_train_steps=$MAX_TRAIN_STEPS \ + --iterations_per_loop=100 \ + --debug=True \ + --eval=False \ + --over_dump=False \ + --data_path=$TRAIN_DATA_PATH \ + --model_dir=$RUN_PATH \ + " + return 0 +} + +function get_eval_cmd() +{ + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code/code/resnext50_train + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" + +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "true" "false" || { logger_Warn 
"run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..8220cddf72bb27fbb61bb4e347f79880540378d0 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/config/config.sh @@ -0,0 +1,16 @@ +export PYTHON_COMMAND=python3.7 + +# 参数信息 +export MODE=train +export BATCH_SIZE=32 +export TRAINING_FILE_PATTERN=/home/datasets/raw_data_tfrecord/train2017* +export RESNET_CHECKPOINT=/home/datasets/raw_data/resneet32_pretrain_model/model.ckpt-28152 +export VALIDATION_FILE_PATTERN=/home/datasets/raw_data/tfrecord/val2017* +export VAL_JSON_FILE=/home/datasets/raw_data/annotations/instances_val2017.json + +# 网络信息 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json +export RANK_SIZE=1 +export DEVICE_NUM=1 + +#export NODEINFO_FILE=/home/tools/2node_6264.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..f6b87bb2c0096190dcafd5fe515d6fc23b6f6ed6 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..cf8c92a82674fab12697a7e6b02b63b61ae3adb4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/scripts/cluster_offline_run.sh @@ -0,0 +1,83 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${MODE?MODE not set}" + : "${BATCH_SIZE?BATCH_SIZE not set}" + : "${TRAINING_FILE_PATTERN?TRAINING_FILE_PATTERN not set}" + : "${RESNET_CHECKPOINT?RESNET_CHECKPOINT not set}" + : "${VALIDATION_FILE_PATTERN?VALIDATION_FILE_PATTERN not set}" + : "${VAL_JSON_FILE?VAL_JSON_FILE not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash -x $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash -x $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info 
"-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..411c1f15bd50d22cbeb5d4b3cbf42ec2e0522ac5 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_ssd_resnet34/scripts/run_node.sh @@ -0,0 +1,78 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + [[ $RANK_SIZE -gt 1 ]] && NUM_EPOCHS=8 || NUM_EPOCHS=1 + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/ssd_main.py \ + --mode=$MODE \ + --train_batch_size=$BATCH_SIZE \ + --training_file_pattern=$TRAINING_FILE_PATTERN \ + --resnet_checkpoint=$RESNET_CHECKPOINT \ + --validation_file_pattern=$VALIDATION_FILE_PATTERN \ + --val_json_file=$VAL_JSON_FILE \ + --num_epochs=$NUM_EPOCHS \ + --num_examples_per_epoch=64000 \ + " + return 0 +} + +function get_eval_cmd() +{ + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { 
logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..0d87cb416fd7492389193a62d260ce1ae98a2ae3 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/config/config.sh @@ -0,0 +1,13 @@ +export PYTHON_COMMAND=python3.7 + +# 参数信息 +export TRAIN_DATA_PATH=/home/datasets/imagenet_TF +export BATCH_SIZE=32 +export MODE=train +export MAX_TRAIN_STEPS=1000 +export ITERATIONS_PER_LOOP=10 +# 网络信息 + +export RANK_SIZE=1 +export DEVICE_NUM=1 + diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..acf00f452373ae949ee6d3fc5b3991803b7c8a93 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/cv/image_classification/VGG16_ID0068_for_TensorFlow" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..444493bd64f3f49cded3174fbc98ee4e86717500 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/scripts/cluster_offline_run.sh @@ -0,0 +1,80 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${BATCH_SIZE?BATCH_SIZE not set}" + : "${MAX_TRAIN_STEPS?MAX_TRAIN_STEPS not set}" + : "${TRAIN_DATA_PATH?TRAIN_DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash -x $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash -x $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start 
--------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..4ac01712dfe43f91b520fb7ad408172fdeb2f683 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_vgg16/scripts/run_node.sh @@ -0,0 +1,83 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + [[ $RUNK_SIZE -ne 8 ]] && DISPLAY_EVERY=10 || DISPLAY_EVERY=1 + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py \ + --batch_size=$BATCH_SIZE \ + --rank_size=$RANK_SIZE \ + --mode=$MODE \ + --max_train_steps=$MAX_TRAIN_STEPS \ + --iterations_per_loop=$ITERATIONS_PER_LOOP \ + --epochs_between_evals=1 \ + --data_dir=$TRAIN_DATA_PATH \ + --max_epochs=1 \ + --display_ever=$DISPLAY_EVERY \ + --lr=0.01 \ + " + return 0 +} + +function get_eval_cmd() +{ + return 0 +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + + check_path_valid "${TRAIN_DATA_PATH}" || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return 1; } + logger_Debug "TRAIN_DATA_PATH is valid" + +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn 
"run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/build.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..3eb8977d09fcdb9059b765d3d2961d6612e88610 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +file_change() +{ + local run_type=$1 + [ "$run_type" == "modelarts" ] && { sed -i 's|RUNTYPE=.*|RUNTYPE="modelarts"|g' ${CURDIR}//output/benchmark.sh; } + return 0 +} + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp -rf ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ + cp ${CURDIR}/../common/* -r ${CURDIR}//output/config/ + cp ${CURDIR}/doc -r ${CURDIR}/output/ + file_change "$2" || { echo "file change failed"; return 1; } +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/config/config.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/config/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..f7b0ae09c54b8c0a111addb32d8e307dcc667dc4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/config/config.sh @@ -0,0 +1,11 @@ +export PYTHON_COMMAND=python3.7 + +# 参数信息 +export DATA_PATH=/home/datasets/yolov3_tf2/data +export MODE="multi" +# 网络信息 +export RANK_TABLE_FILE=/home/tools/rank_table_8p_62.json +export RANK_SIZE=8 +export DEVICE_NUM=8 + +#export NODEINFO_FILE=/home/tools/2node_6264.json diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/master.patch b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/master.patch new file mode 100644 index 0000000000000000000000000000000000000000..dc26fe07fdb1db8f2c0b860f331a1644b65572d4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/master.patch @@ -0,0 +1,43 @@ +diff -Nur origin/train.py code/train.py +--- origin/train.py 2021-12-17 19:18:34.127267733 +0800 ++++ code/train.py 2021-12-17 19:18:34.131267604 +0800 +@@ -309,6 +309,8 @@ + best_mAP = -np.Inf + train_op = util.set_iteration_per_loop(sess, train_op, args.iterations_per_loop) + sess.run(train_init_op) ++ fps_sum = 0 ++ fps_num = 0 + for epoch in range(args.total_epoches): + loss_total, loss_xy, loss_wh, loss_conf, loss_class = AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter() + for i in trange(args.train_batch_num // args.iterations_per_loop): +@@ -317,7 +319,8 @@ + [train_op, merged, y_true, loss, global_step, learning_rate] + ) + fps = 1 / (time.time() - t) * args.iterations_per_loop * args.num_gpus * args.batch_size +- ++ fps_num += 1 ++ fps_sum += fps + writer.add_summary(summary, global_step=__global_step) + + loss_total.update(__loss[0], len(__y_true[0])) +@@ 
-351,6 +354,20 @@ + if __global_step >= 500: + break + ++ # output throughtout ++ throughput_rate = fps_sum/fps_num ++ rank_id = int(os.getenv('RANK_ID')) ++ THROUGHPUT_DIR = os.getenv("RESULT_PATH") ++ if THROUGHPUT_DIR is None: ++ print("Warning: The environment variable 'RESULT_PATH' is not set. ") ++ elif not os.path.isdir(THROUGHPUT_DIR): ++ print("Warning: The environment variable 'RESULT_PATH' is not a valid directory. ") ++ else: ++ print("THROUGHPUT_DIR:", THROUGHPUT_DIR) ++ THROUGHPUT_LOG = os.path.join(THROUGHPUT_DIR, "throughput_rank_{}".format(rank_id)) ++ with open(THROUGHPUT_LOG, 'w') as f: ++ f.write("{}".format(throughput_rate)) ++ + saver_to_save.save(sess, args.save_dir + 'model-final_step_{}_loss_{:.4f}_lr_{:.5g}'.format( \ + int(__global_step), + loss_total.average, diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/patch.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..3b23d7006ec0218200deef3b935b7b7ac3a05e5d --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/patch.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +SRC_PATH=$CUR_PATH/../../../ +. 
$SRC_PATH/common/patch_common.sh + +get_git_info(){ + local branch_args="$1" + local run_type="$2" + + # set default branch + [[ -z "$branch_args" ]] && { branch_args="master"; } + + if [ "$branch_args" == "master" ];then + branch="master" + patch_file_name="master" + commitid="ca293a6071e44f6286e9ef3c1415c9818c1dd7af" + git_url="https://gitee.com/ascend/ModelZoo-TensorFlow.git" + modelzoo_sub_dir="ModelZoo-TensorFlow/TensorFlow/built-in/cv/detection/YoloV3_ID0076_for_TensorFlow" + else + echo "bad parameters : $1" + return $ret_error + fi + + [ "$run_type" == "modelarts" ] && { patch_file_name="modelarts_"$patch_file_name; } + return $ret_ok +} + +main(){ + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ];then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + local patch_type="$1" + local branch_args="$2" + local run_type="$3" + + get_git_info "$branch_args" "$run_type" || { echo "warn get git info failed"; return $ret_error; } + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$patch_type" == "mkpatch" ];then + make_patch || { echo "warn make patch failed"; return $ret_error; } + elif [ "$patch_type" == "loadcode" ];then + load_code || { echo "warn make patch failed"; return $ret_error; } + mkdir -p $CUR_PATH/doc + mk_version_file $CUR_PATH/doc/version.txt + else + echo "null op" + return $ret_error + fi +} + +main "$@" +exit $? 
diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..766ba210a8f9a1aa660b3ab41a475b128c5e8b31 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/scripts/benchmark.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_init_failed=1 +declare -i ret_run_train_failed=2 +declare -i ret_run_eval_failed=3 +declare -i ret_get_result_failed=4 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export CODE_PATH=$CUR_PATH +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/common.sh + +RUNTYPE="cluster_offline" + +[ $RUNTYPE == "modelarts" ] && . $CODE_PATH/modelarts_run.sh +[ $RUNTYPE == "cluster_offline" ] && . $CODE_PATH/cluster_offline_run.sh + +main(){ + init || { logger_Warn "init failed:$?";return $ret_init_failed; } + run_train || { logger_Warn "run_train failed ret:$?";return $ret_run_train_failed; } + run_eval || { logger_Warn "run_eval failed ret:$?";return $ret_run_eval_failed; } + get_result || { logger_Warn "get_result failed ret:$?";return $ret_get_result_failed; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/scripts/cluster_offline_run.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/scripts/cluster_offline_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea3a88de0d9213fb126a8c8f6e42908f97f8038d --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/scripts/cluster_offline_run.sh @@ -0,0 +1,78 @@ +#!/bin/bash +. $CODE_PATH/common/common.sh +. $CODE_PATH/common/log_util.sh +. $CODE_PATH/common/cluster_common.sh +. 
$CODE_PATH/common/node_common.sh + +# env check +check_env() +{ + # base check + check_env_common || { logger_Warn "check env common failed:$?";return 1; } + + # model info check + : "${DATA_PATH?DATA_PATH not set}" + + # check env of each node + cmd="export WORK_PATH=$WORK_PATH; + bash -x $WORK_PATH/run_node.sh check ${WORK_PATH}/config/$CONFIG_FILE" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { return 1; } +} + +init() +{ + logger_Info "-------------------------------- init start --------------------------------" + # set nodes work path + export WORK_PATH=${BASE_PATH}/work + # set nodes result path + export RESULT_PATH=${WORK_PATH}/result + export PYTHONPATH=$PYTHONPATH:$CODE_PATH + CONFIG_FILE="config.sh" + source ${CODE_PATH}/config/$CONFIG_FILE || { logger_Warn "source file failed:$?";return 1; } + + rm -rf $RESULT_PATH;mkdir -p $RESULT_PATH + # sync data if work_path not exist so new one + + cmd="rm -rf ${WORK_PATH};mkdir -p ${WORK_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "renew workpath failed"; return 1; } + + cluster_scp "${NODEINFO_FILE}" ${CODE_PATH} ${WORK_PATH} || { logger_Warn "run scp failed"; return 1; } + + check_env || { logger_Warn "env check failed'" ; return 1; } + logger_Info "-------------------------------- init end --------------------------------" +} + +run_train() +{ + logger_Info "-------------------------------- train start --------------------------------" + cmd="export WORK_PATH=$WORK_PATH; + export RESULT_PATH=$RESULT_PATH; + source $WORK_PATH/config/$CONFIG_FILE; + bash -x $WORK_PATH/run_node.sh train" + + cluster_run_cmd_parallel "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run train failed"; return 1; } + logger_Info "-------------------------------- train end --------------------------------" +} + +run_eval() +{ + logger_Info "-------------------------------- eval start --------------------------------" + cmd="source $WORK_PATH/config/$CONFIG_FILE; + export WORK_PATH=$WORK_PATH; + export 
RESULT_PATH=$RESULT_PATH; + bash $WORK_PATH/run_node.sh eval" + cluster_run_cmd_single "${NODEINFO_FILE}" ${cmd} || { logger_Warn "run eval failed"; return 1; } + logger_Info "-------------------------------- eval end --------------------------------" +} + +get_result() +{ + logger_Info "-------------------------------- get_result start --------------------------------" + + cmd="mkdir -p ${RESULT_PATH}" + cluster_run_cmd_serial "$NODEINFO_FILE" ${cmd} || { logger_Warn "mkdir resultpath failed"; return 1; } + + cluster_rscp "${NODEINFO_FILE}" ${RESULT_PATH} ${RESULT_PATH} + ${PYTHON_COMMAND} ${CODE_PATH}/common/calc_result.py ${RESULT_PATH} ${RANK_SIZE} + logger_Info "-------------------------------- get_result end --------------------------------" +} diff --git a/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/scripts/run_node.sh b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/scripts/run_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..3112b4d04a57bad4a5c0277997f986ad76a826bb --- /dev/null +++ b/huawei/ais-bench_workload/src/train/huawei/train_tensorflow_yolov3/scripts/run_node.sh @@ -0,0 +1,105 @@ +#!/bin/bash +. $WORK_PATH/common/common.sh +. $WORK_PATH/common/log_util.sh +. 
$WORK_PATH/common/node_common.sh + +# 获取训练命令 +function get_train_cmd() +{ + rm -rf $WORK_PATH/code/data + ln -sf $DATA_PATH $RUN_PATH + ln -sf $DATA_PATH $WORK_PATH/code + train_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/train.py \ + --mode=$MODE \ + " + return 0 +} + +function get_eval_cmd() +{ + rm -rf $WORK_PATH/code/data + ln -sf $DATA_PATH $RUN_PATH + ln -sf $DATA_PATH $WORK_PATH/code + cp $WORK_PATH/code/eval_coco.py $RUN_PATH/ + eval_run_cmd="${PYTHON_COMMAND} -u $WORK_PATH/code/eval.py \ + --save_json=True \ + --score_thresh=0.0001 \ + --nms_thresh=0.55 \ + --max_boxes=100 \ + --restore_path=$RESTORE_PATH \ + --max_test=10000 \ + --save_json_path=eval_res_D$RANK_ID.json \ + " + return +} + +function node_init() +{ + export PYTHONPATH=$PYTHONPATH:$WORK_PATH:$WORK_PATH/code + source $WORK_PATH/config/tensorflow_env.sh + # for eval env set + [ $1 == "eval" ] && { export RANK_SIZE=1; export DEVICE_ID=0; : "${SINGLE_CARD_INDEX:=0}";export RANK_ID=$SINGLE_CARD_INDEX; unset RANK_TABLE_FILE; } + [[ -z "$RESULT_PATH" ]] || { mkdir -p $RESULT_PATH; } +} + +function node_check() +{ + CONFIG_FILE_PATH=$1 + source $CONFIG_FILE_PATH + + # 通用检测 主要检测 PYTHON_COMMAND RANK_SIZE和RANK_TABLE + node_common_check "${PYTHON_COMMAND}" "${RANK_SIZE}" "$RANK_TABLE_FILE" || { logger_Warn "node common check failed" ; return 1; } + # 检测是否安装对应框架软件 + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" + + check_path_valid "${DATA_PATH}" || { logger_Warn "DATA_PATH:${DATA_PATH} not valid path" ; return 1; } + logger_Debug "DATA_PATH is valid" + + logger_Debug "EVAL_DATA_PATH is valid" + +} + +function node_train() +{ + # 调用通用训练接口 + node_common_train "true" "false" || { logger_Warn "run train failed" ; return 1; } +} + +function node_eval() +{ + RUN_PATH=$WORK_PATH/train_parallel$RANK_ID + RESTORE_PATH=$RUN_PATH/training/ + cd $RUN_PATH + + get_eval_cmd 
+ echo "start eval RUN_PATH:${RUN_PATH} SERVER_ID:$SERVER_ID rank $RANK_ID device $DEVICE_ID begin cmd:${eval_run_cmd}" + $eval_run_cmd > $RUN_PATH/eval.log || { echo "run eval node error ret:$?"; return 1; } + + if [ -f "$RUN_PATH/eval.log" ];then + accuracy=`cat $RUN_PATH/eval.log |grep "Average Precision" |grep -v grep |awk -F= 'NR==1{print $NF}'` + logger_Info "accuracy: $accuracy" + echo "$accuracy" > $RESULT_PATH/eval_acc.log + fi + + return 0 +} + +main() +{ + type="$1" + shift + node_init $type || { logger_Warn "init failed"; return 1; } + if [ "$type" == "train" ];then + node_train "$@" || { logger_Warn "run_node_train failed"; return 1; } + elif [ "$type" == "eval" ];then + node_eval "$@" || { logger_Warn "run_node_eval failed"; return 1; } + elif [ "$type" == "check" ];then + node_check "$@" || { logger_Warn "run_node_check failed"; return 1; } + else + { logger_Warn "invalid argument '${type}'"; return 1; } + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/README.MD b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..4e0972d6ef238b130fff7bee0c8c5c49a7fda4ab --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/README.MD @@ -0,0 +1,80 @@ +## 1. 
背景 + +### 1.1 bert模型基准来源: + +https://github.com/google-research/bert/tree/master + +### 1.2 训练指标计算说明 + +- accuracy 训练结果评估中masked_lm_accuracy的值 +- throughput_ratio 计算公式--train_batch_size * num_train_steps / (end - real_step_start_time)。 + 公式说明: + + train_batch_size 训练批数 + + num_train_steps 训练步数 + + real_step_start_time 训练开始时间。该时间是单纯训练时间,不包括训练数据加载内存、加载device、训练热身等辅助时间,因此忽略了训练前2步耗费的时间 + + end 训练结束时间 + + +## 2.训练过程 + +### 2.1 训练准备 + +#### 2.1.1 环境要求 ++ python3.7.5、tensorflow 1.13(gpu版)、anaconda3。建议在conda环境训练 ++ 可以在线拉取github.com的代码 ++ 执行nvidia-smi,检查当前设备是否使用。若使用请终止相关进程 +#### 2.1.2 bert模型下载 +到bert[官网](https://github.com/google-research/bert) "Pre-trained models"小节选择合适的bert模型下载到本地并解压使用。 + +bert large配置包信息: ++ 配置包规格:BERT-Large, Uncased: 24-layer, 1024-hidden, 16-heads, 340M ++ 压缩包名称:uncased-L-24_H-1024_A-16.zip ++ 下载地址:https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip + +说明:本训练用到该配置包中的bert_config.json、vocab.txt文档。其它暂不涉及。 + + +### 2.2 修改预训练执行配置 +#### 2.2.1 修改配置文件 +执行`vim Ais-Bench-Stubs-aarch64/code/config/config_bert.sh`,修改配置。 + ++ BERT_CONFIG_DIR 修改为bert 配置目录 ++ TRAIN_STEPS 训练步数 ++ CUDA_VISIBLE_DEVICES 修改为指定设备ID。只有一位数字时为单卡训练。多卡同时使用时,逗号间隔,比如“0,1" + +#### 2.2.2 修改run.sh +执行`vim Ais-Bench-Stubs-aarch64/code/run.sh`, 适当修改train_run_cmd变量中预训练参数。 +### 2.3. 执行训练测试 +进入工作目录Ais-Bench-Stubs-aarch64, 执行以下指令进行本地训练: +``` +cd Ais-Bench-Stubs-aarch64 +rm -rf output + +./ais-bench-stubs test +``` + + +### 2.4 本地训练结果 + +训练过程,屏幕会有日志输出。训练结束会打印如下2个信息: +#### 2.4.1 train_result_info信息 +该信息包括了精度accuracy和吞吐率throughput_ratio。 + +``` +[2021-8-4 12:35:39][INFO]train_result_info: { +"accruacy" : "0.05533597", +"throughput_ratio" : "2.456748253218857", +... 
+} +``` +#### 2.4.2 tensorflow回调函数打印的吞吐率 +``` +actual callback_throught_rate: 2.7236 +``` + + +### 2.5.训练执行注意事项 ++ 训练环境需要能联网,方便在线下载代码和bert模型 ++ 预训练参数max_seq_length越大,越容易造成预训练过程的资源耗尽问题。请选择合适参数 ++ 中断训练时,需要执行nvidia-smi,查到当前执行的进程ID,强行杀死该进程,避免影响下一次训练 ++ 每次训练前清空run_pretrain.py执行的output目录(ais-bench-stubs同级目录),会确保预训练时tensorflow日志sample/sec打印出来 diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/build.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..4af8c96310e27b37e91d790a80bf089e70ea1aac --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/config/config_pretrain.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/config/config_pretrain.sh new file mode 100644 index 0000000000000000000000000000000000000000..eb5006cb9a9fb427cf78a756c484bafb1f790c4f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/config/config_pretrain.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export PYTHON_COMMAND=python3.7.5 +export BERT_CONFIG_DIR=/path_to_bert_model_folder +export TRAIN_DATA_PATH=/path_to_bert_data_file +export TRAIN_STEPS=200 +export CUDA_VISIBLE_DEVICES="0" +export BATCH_SIZE=1 +export MAX_SEQ_LENGTH=512 + +# bmc info get power info from bmc +export BMC_IP="" +export BMC_USER="Administrator" +export BMC_PASSWORD="" diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/master.patch b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/master.patch new file mode 100644 index 0000000000000000000000000000000000000000..8918fe686722dcc039aa84624c5c585a723f1eb4 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/master.patch @@ -0,0 +1,35 @@ +diff -Nur -x '.git*' origin/run_pretraining.py code/run_pretraining.py +--- origin/run_pretraining.py 2021-09-02 13:52:16.480000000 +0800 ++++ code/run_pretraining.py 2021-09-02 13:52:16.488000000 +0800 +@@ -22,6 +22,8 @@ + import modeling + import optimization + import tensorflow as tf ++import time ++import ais_utils + + flags = tf.flags + +@@ -463,7 +465,13 @@ + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) ++ start = time.process_time() + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) ++ end = time.process_time() ++ data_sum = FLAGS.train_batch_size * FLAGS.num_train_steps ++ throught_rate = ais_utils.calc_throughput_rate(data_sum, (int)(end - start)) ++ 
ais_utils.set_result("training", "throughput_ratio", throught_rate) ++ print("start:{} end:{} batchsize:{} train_steps:{} datasum:{} start:{} end:{} duration:{} throught_rate:{}".format(start, end, FLAGS.train_batch_size, FLAGS.num_train_steps, data_sum, start, end, (end-start), throught_rate)) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") +@@ -485,6 +493,8 @@ + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + ++ if "masked_lm_accuracy" in result: ++ ais_utils.set_result("training", "accuracy", result["masked_lm_accuracy"]) + + if __name__ == "__main__": + flags.mark_flag_as_required("input_file") diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/patch.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..1b0418a61bc8b3f5dc20895335900a8356d6200a --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/patch.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +git_url="https://github.com/google-research/bert.git" +git_url="https://gitee.com/lanhaibo4/bert.git" +branch="master" + +master_commitid="eedf5716ce1268e56f0a50264a88cafad334ac61" +bert_sub_dir="bert" + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +get_bert_base_code_by_git() { + git clone $git_url -b $branch + cd $bert_sub_dir + git reset --hard $commitid + cd - +} + +make_patch() { + cd $BUILD_TMP_PATH + get_bert_base_code_by_git + cp $bert_sub_dir -rf $BUILD_TMP_PATH/origin + cp $target_dir -rf $BUILD_TMP_PATH/code + + diff -Nur -x ".git*" origin code >$BUILD_TMP_PATH/$branch.patch + + cp $BUILD_TMP_PATH/$branch.patch $CUR_PATH/ +} + +load_code() { + cd $BUILD_TMP_PATH + get_bert_base_code_by_git + cp $bert_sub_dir -rf $BUILD_TMP_PATH/origin + cp $bert_sub_dir -rf $BUILD_TMP_PATH/code + + patch -p0 
<$CUR_PATH/$branch.patch + + [ ! -d $target_patchcode_dir ] || rm -rf $target_patchcode_dir + mkdir $target_patchcode_dir + cp $BUILD_TMP_PATH/code/* -rf $target_patchcode_dir/ +} + +main() { + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ]; then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + commitid=$master_commitid + + echo "patch.sh run type:$1 branch:$branch" + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$1" == "mkpatch" ]; then + make_patch + elif [ "$1" == "loadcode" ]; then + load_code + else + echo "null op" + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..caf55318a5aa88cbe78439c8dcd9418d31e0eea0 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/scripts/benchmark.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_invalid_args=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CUR_PATH/common/log_util.sh +. $CUR_PATH/common/common.sh +. $CUR_PATH/common/calc_power.sh +. 
$CUR_PATH/common/calc_resourceinfo.sh + +# 设置配置文件中的环境变量 +set_config_env() { + export PYTHONPATH=$PYTHONPATH:${CUR_PATH}:${CUR_PATH}/code + source $CUR_PATH/config/config_pretrain.sh +} + +# 环境变量检查 +check_env() { + return +} + +# 文件路径和环境依赖检查 +check_sys() { + check_path_valid $BERT_CONFIG_DIR || { logger_Warn "BERT_CONFIG_DIR:${BERT_CONFIG_DIR} not valid path"; return $ret_invalid_args; } + logger_Debug "train path valid" + + check_file_valid $TRAIN_DATA_PATH || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path"; return $ret_invalid_args; } + logger_Debug "train data file path valid" + + #check_python_version || { logger_Warn "python version not match"; return $ret_invalid_args; } + #logger_Debug "python version valid" + + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install"; return $ret_invalid_args; } + logger_Debug "python packet tensorflow valid" +} + +main() { + set_config_env + + check_env + + check_sys || exit $ret_invalid_args + + calc_powerinfo_backgroud + + device_group=(0) + run_resourceinfo_monitor_backgroud_gpu + + bash $CUR_PATH/run.sh + + calc_runing_resourceinfo_gpu $CUR_PATH/ais_utils.py $device_group + + set_powerinfo +} + +main "$@" |& tee "${BASE_PATH}/log/detail.log" +exit $? diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/scripts/run.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/scripts/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..78da140ff80d299cdebbb61d93109090a68dea80 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_bert/scripts/run.sh @@ -0,0 +1,48 @@ +#!/bin/bash +declare -i ret_invalid_args=1 +declare -i ret_train_failed=2 +CUR_PATH=$(dirname $(readlink -f "$0")) +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CUR_PATH/common/log_util.sh +. 
$CUR_PATH/common/common.sh + +ulimit -u unlimited + +run_train() { + # create work path + rm -rf $BASE_PATH/work + mkdir -p $BASE_PATH/work + cp $CUR_PATH/code/* $BASE_PATH/work -rf + + train_run_cmd="python3 -u $BASE_PATH/work/run_pretraining.py \ + --input_file=$TRAIN_DATA_PATH \ + --output_dir=$BASE_PATH/work \ + --do_train=True \ + --do_eval=True \ + --bert_config_file=$BERT_CONFIG_DIR/bert_config.json \ + --max_seq_length=$MAX_SEQ_LENGTH \ + --max_predictions_per_seq=76 \ + --num_train_steps=$TRAIN_STEPS \ + --num_warmup_steps=0 \ + --train_batch_size=$BATCH_SIZE + " + + logger_Info "train run cmd:$train_run_cmd" + $train_run_cmd || { ret=$ret_train_failed;logger_Warn "run train failed ret:$?"; } + logger_Info "train run done cmd:$train_run_cmd" + return $ret +} + +main() { + python3 $CUR_PATH/ais_utils.py set_result "training" "proc_start_time" $(date "+%Y-%m-%d %H:%M:%S") + run_train || { logger_Warn "train failed ret:$?"; return $ret_train_failed; } + python3 $CUR_PATH/ais_utils.py set_result "training" "proc_end_time" $(date "+%Y-%m-%d %H:%M:%S") + callback_throught_rate=`cat ${BASE_PATH}/log/detail.log | grep "examples/sec:" | tail -n 1 | awk -F ' ' '{print $NF}'` + echo "actual callback_throught_rate:$callback_throught_rate" + + python3 $CUR_PATH/ais_utils.py set_result "training" "result" "OK" +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/README.MD b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..57ac828ca6048dec59c4a689077a985c194dc20f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/README.MD @@ -0,0 +1,55 @@ +## 1. 
背景 + +### 1.1 resnet模型基准来源: + +https://github.com/tensorflow/models/tree/r1.13.0/official/resnet + +### 1.2 训练指标计算说明 + +- accuracy 训练结果中accuracy的值 +- throughput_ratio 计算公式--imagenet数据集图片数目 * epoch_size / 训练时间(包括训练结果评估时间) + +-- imagenet数据集图片数目 全部数据集 1280000个 + +## 2.训练过程 + +### 2.1 训练准备 + +#### 2.1.1 环境要求 ++ python3.7.5、tensorflow 1.13(gpu版)、anaconda3 ++ 执行nvidia-smi,检查当前设备是否使用。若使用请终止相关进程 +#### 2.1.2 imagenet tensorflow数据集下载 +到resnet官网下载imagenet tensorflow类型的数据集 + +### 2.2 修改预训练执行配置 +执行`vim Ais-Bench-Stubs-aarch64/code/config/config_imagenet2012.sh`, 修改配置。 ++ TRAIN_DATA_PATH 修改为resnet训练数据集目录 ++ EPOCH_SIZE epoch 数目 ++ RESNET_SIZE resnet层数 ++ CUDA_VISIBLE_DEVICES 指定设备进行训练。"1"表示仅使用设备1训练。"0,1",表示同时设备0和1进行训练。 + +### 2.3. 执行训练测试 +进入工作目录, 执行以下指令进行本地训练: +``` +cd Ais-Bench-Stubs-aarch64 + +./ais-bench-stubs test +``` + +### 2.4 本地训练结果 + +训练过程,屏幕会有日志输出。训练结束会打印train_result_info信息,包括了精度accuracy和吞吐率throughput_ratio。 + +``` +[2021-8-3 14:35:39][INFO]train_result_info: { +"accruacy" : "0.11714", +"throughput_ratio" : "76.54648206089946", +... 
+} +``` +说明:当前结果是基于 P100 单卡的 1个epoch的训练结果 + + +### 2.5.训练执行注意事项 ++ 训练环境需要能联网,方便在线下载代码和resnet数据集 ++ 中断训练时,需要执行nvidia-smi,查到当前执行的进程ID,强行杀死该进程,避免影响下一次训练 diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/build.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..4af8c96310e27b37e91d790a80bf089e70ea1aac --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +declare -i ret_ok=0 +declare -i ret_error=1 + +CURDIR=$(dirname $(readlink -f $0)) + +function main() +{ + rm -rf ${CURDIR}/output/* + mkdir -p ${CURDIR}/output + echo "build call args:$@" + bash $CURDIR/patch.sh loadcode "$@" || { echo "warn run patch failed"; return 1; } + cp -rf ${CURDIR}/patchcode ${CURDIR}//output/code + cp ${CURDIR}/scripts/* ${CURDIR}//output/ + cp ${CURDIR}/../../../common -r ${CURDIR}//output/ + cp ${CURDIR}/config -r ${CURDIR}//output/ +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/config/config_imagenet2012.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/config/config_imagenet2012.sh new file mode 100644 index 0000000000000000000000000000000000000000..fe8b0585091ce2c972dd832f93a40eee46b814d9 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/config/config_imagenet2012.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export PYTHON_COMMAND=python3.7.5 +export TRAIN_DATA_PATH=/path_to_train_data + +export EPOCH_SIZE=1 + +export RESNET_SIZE=50 +export CUDA_VISIBLE_DEVICES="0" +# bmc info get power info from bmc +export BMC_IP="" +export BMC_USER="Administrator" +export BMC_PASSWORD="" + diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/patch.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/patch.sh new file mode 100644 index 0000000000000000000000000000000000000000..edfe17c707f2748fe1ba71d1e5ed15fbf1ef2131 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/patch.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +declare -i ret_error=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) + +git_url="https://github.com/tensorflow/models.git" +git_url="https://gitee.com/lanhaibo4/models.git" + +branch="r1.13.0" + +r1_13_0_commitid="57e075203f8fba8d85e6b74f17f63d0a07da233a" + +modelzoo_sub_dir="models/official" + +target_dir=$CUR_PATH/code +target_patchcode_dir=$CUR_PATH/patchcode + +get_modelzoo_base_code_by_git() { + git clone $git_url -b $branch + cd models + git reset --hard $commitid + cd - +} + +make_patch() { + cd $BUILD_TMP_PATH + get_modelzoo_base_code_by_git + mkdir $BUILD_TMP_PATH/origin + mkdir $BUILD_TMP_PATH/code + cp $modelzoo_sub_dir -rf $BUILD_TMP_PATH/origin/ + cp $target_dir/* -rf $BUILD_TMP_PATH/code/ + + diff -Nur -x ".git*" origin code >$BUILD_TMP_PATH/$branch.patch + + cp $BUILD_TMP_PATH/$branch.patch $CUR_PATH/ 
+} + +load_code() { + cd $BUILD_TMP_PATH + get_modelzoo_base_code_by_git + mkdir $BUILD_TMP_PATH/origin + mkdir $BUILD_TMP_PATH/code + cp $modelzoo_sub_dir -rf $BUILD_TMP_PATH/origin/ + cp $modelzoo_sub_dir -rf $BUILD_TMP_PATH/code/ + + patch -p0 <$CUR_PATH/$branch.patch + + [ ! -d $target_patchcode_dir ] || rm -rf $target_patchcode_dir + mkdir $target_patchcode_dir + rm -rf $BUILD_TMP_PATH/code/research + cp $BUILD_TMP_PATH/code/* -rf $target_patchcode_dir/ +} + +main() { + if [ "$1" != "mkpatch" -a "$1" != "loadcode" ]; then + echo "target not valid in:[$1] not match [mkpatch loadcode]" + return $ret_error + fi + + commitid=$r1_13_0_commitid + + echo "patch.sh run type:$1 branch:$branch" + + BUILD_TMP_PATH=$CUR_PATH/buildtmp + [ ! -d $BUILD_TMP_PATH ] || rm -rf $BUILD_TMP_PATH + mkdir -p $BUILD_TMP_PATH + + if [ "$1" == "mkpatch" ]; then + make_patch + elif [ "$1" == "loadcode" ]; then + load_code + else + echo "null op" + fi +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/r1.13.0.patch b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/r1.13.0.patch new file mode 100644 index 0000000000000000000000000000000000000000..2e8b20a3b67d0d6baec5a374535f1d3d644ef7b2 --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/r1.13.0.patch @@ -0,0 +1,59 @@ +diff -Nur -x '.git*' origin/official/resnet/resnet_run_loop.py code/official/resnet/resnet_run_loop.py +--- origin/official/resnet/resnet_run_loop.py 2021-09-03 18:05:29.200000000 +0800 ++++ code/official/resnet/resnet_run_loop.py 2021-09-03 18:05:29.220000000 +0800 +@@ -27,6 +27,7 @@ + import math + import multiprocessing + import os ++import time + + # pylint: disable=g-bad-import-order + from absl import flags +@@ -41,7 +42,12 @@ + from official.resnet import imagenet_preprocessing + from official.utils.misc import distribution_utils + from official.utils.misc import model_helpers ++import ais_utils + ++NUM_IMAGES = 
{ ++ 'train': 1281167, ++ 'validation': 50000, ++} + + ################################################################################ + # Functions for input processing. +@@ -478,7 +484,7 @@ + run_config = tf.estimator.RunConfig( + train_distribute=distribution_strategy, + session_config=session_config, +- save_checkpoints_secs=60*60*24) ++ save_checkpoints_secs=60*60*24, keep_checkpoint_max=1) + + # Initializes model with all but the dense layer from pretrained ResNet. + if flags_obj.pretrained_model_checkpoint_path is not None: +@@ -560,8 +566,15 @@ + tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops)) + + if num_train_epochs: ++ start = time.process_time() + classifier.train(input_fn=lambda: input_fn_train(num_train_epochs), + hooks=train_hooks, max_steps=flags_obj.max_train_steps) ++ end = time.process_time() ++ data_sum = NUM_IMAGES['train'] * flags.FLAGS.train_epochs ++ throughput_rate = ais_utils.calc_throughput_rate(data_sum, (int)(end - start)) ++ ais_utils.set_result("training", "throughput_ratio", throughput_rate) ++ print("starttime: {} endtime:{} image_number: {} epoch_size: {} throughput_ratio: {}".format(start, \ ++ end, NUM_IMAGES['train'], flags.FLAGS.train_epochs, throughput_rate)) + + tf.logging.info('Starting to evaluate.') + +@@ -573,6 +586,9 @@ + # global_step count. 
+ eval_results = classifier.evaluate(input_fn=input_fn_eval, + steps=flags_obj.max_train_steps) ++ print("eval_results: {}".format(eval_results)) ++ if 'accuracy' in eval_results: ++ ais_utils.set_result('training', 'accuracy', eval_results['accuracy']) + + benchmark_logger.log_evaluation_result(eval_results) + diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/scripts/benchmark.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..9bd23572207ea95548cf927b063c5fecc1f7ea8f --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/scripts/benchmark.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# 返回码 +declare -i ret_ok=0 +declare -i ret_invalid_args=1 + +CUR_PATH=$(dirname $(readlink -f "$0")) +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CUR_PATH/common/log_util.sh +. $CUR_PATH/common/common.sh +. $CUR_PATH/common/calc_power.sh +. $CUR_PATH/common/calc_resourceinfo.sh + +# 设置配置文件中的环境变量 +set_config_env() +{ + export PYTHONPATH=$PYTHONPATH:${CUR_PATH}:${CUR_PATH}/code + + source $CUR_PATH/config/config_imagenet2012.sh +} + +# 环境变量检查 +check_env() +{ + : "${EPOCH_SIZE?EPOCH_SIZE not set}" +} + +# 文件路径和环境依赖检查 +check_sys() +{ + check_path_valid $TRAIN_DATA_PATH || { logger_Warn "TRAIN_DATA_PATH:${TRAIN_DATA_PATH} not valid path" ; return $ret_invalid_args; } + logger_Debug "train path valid" + #check_path_valid $TEST_DATA_PATH || { logger_Warn "TEST_DATA_PATH:${TEST_DATA_PATH} not valid path" ; return $ret_invalid_args; } + #logger_Debug "test path valid" + + #check_python_version || { logger_Warn "python version not match" ; return $ret_invalid_args; } + #logger_Debug "python version valid" + + check_python_package_is_install ${PYTHON_COMMAND} "tensorflow" || { logger_Warn "tensorflow package not install" ; return $ret_invalid_args;} + logger_Debug "python packet tensorflow valid" +} + +main() +{ + set_config_env + 
+ check_env + + check_sys || exit $ret_invalid_args + + calc_powerinfo_backgroud + + device_group=(0) + run_resourceinfo_monitor_backgroud_gpu + + bash $CUR_PATH/run.sh + + calc_runing_resourceinfo_gpu $CUR_PATH/ais_utils.py $device_group + + set_powerinfo +} + +main "$@" +exit $? diff --git a/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/scripts/run.sh b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/scripts/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..1ccc4f3c10f56bc1b1827fbe4828e6e20adc83de --- /dev/null +++ b/huawei/ais-bench_workload/src/train/nvidia/train_tensorflow_resnet/scripts/run.sh @@ -0,0 +1,39 @@ +#!/bin/bash +declare -i ret_invalid_args=1 +CUR_PATH=$(dirname $(readlink -f "$0")) +export BASE_PATH=$(cd "$CUR_PATH/../";pwd) + +. $CUR_PATH/common/log_util.sh +. $CUR_PATH/common/common.sh + +ulimit -u unlimited + +run_train(){ + # create work path + rm -rf $BASE_PATH/work + mkdir -p $BASE_PATH/work + cp $CUR_PATH/code/* $BASE_PATH/work/ -rf + + train_run_cmd="python -u $BASE_PATH/work/official/resnet/imagenet_main.py \ + --data_dir=${TRAIN_DATA_PATH} \ + --train_epochs=$EPOCH_SIZE \ + --resnet_size=$RESNET_SIZE \ + --model_dir=$BASE_PATH/work \ + --epochs_between_evals=$EPOCH_SIZE \ + " + $train_run_cmd + cp -f $BASE_PATH/work/model.ckpt-* $BASE_PATH/result + logger_Info "train run done" +} + +main() +{ + python $CUR_PATH/ais_utils.py set_result "training" "proc_start_time" `date "+%Y-%m-%d %H:%M:%S"` + + run_train + python $CUR_PATH/ais_utils.py set_result "training" "proc_end_time" `date "+%Y-%m-%d %H:%M:%S"` + python3 $CUR_PATH/ais_utils.py set_result "training" "result" "OK" +} + +main "$@" +exit $?