diff --git a/models/cv/classification/swin_transformer_large/ixrt/README.md b/models/cv/classification/swin_transformer_large/ixrt/README.md
index f09a5c125c0dc5a021aa2d2d1fd25961d08bf02b..032c961d88878d66c4c830cac3b3273545d80295 100644
--- a/models/cv/classification/swin_transformer_large/ixrt/README.md
+++ b/models/cv/classification/swin_transformer_large/ixrt/README.md
@@ -10,8 +10,6 @@ Swin Transformer-Large is a variant of the Swin Transformer, an architecture des
 | :----: | :----: | :----: |
 | MR-V100 | 4.2.0 | 25.03 |
 
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
 ## Model Preparation
 
 ### Prepare Resources
@@ -53,6 +51,7 @@ python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/swin-large/s
 
 ```bash
 git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 
 export ORIGIN_ONNX_NAME=./swin-large-torch-fp32
 export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh b/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh
index b7fe2e695819ea348d0045c5774cc5e7af8037f2..02ac2c462036a5839f09d7448278098897d21679 100644
--- a/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh
+++ b/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh
@@ -26,6 +26,8 @@ else
 fi
 
 apt install -y libnuma-dev
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 pip install -r requirements.txt
 
 mkdir -p general_perf/model_zoo/regular
diff --git a/models/nlp/plm/albert/ixrt/README.md b/models/nlp/plm/albert/ixrt/README.md
index 5944c1d15e499710580328e7b981568e83916586..778719bddff35be6d4fc5136b18e6efcc3d96da5 100644
--- a/models/nlp/plm/albert/ixrt/README.md
+++ b/models/nlp/plm/albert/ixrt/README.md
@@ -10,8 +10,6 @@ Albert (A Lite BERT) is a variant of the BERT (Bidirectional Encoder Representat
 | :----: | :----: | :----: |
 | MR-V100 | 4.2.0 | 25.03 |
 
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
 ## Model Preparation
 
 ### Prepare Resources
@@ -51,6 +49,7 @@ onnxsim albert-torch-fp32.onnx albert-torch-fp32-sim.onnx
 
 ```bash
 git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 
 export ORIGIN_ONNX_NAME=./albert-torch-fp32-sim
 export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/nlp/plm/albert/ixrt/ci/prepare.sh b/models/nlp/plm/albert/ixrt/ci/prepare.sh
index 68e8aa19da2132447fdfe6ea48f42bc026f48d7c..9e0dc3b925183fc0ca18848d3dd31cdec4bdf2f1 100644
--- a/models/nlp/plm/albert/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/albert/ixrt/ci/prepare.sh
@@ -21,6 +21,8 @@ apt install -y libnuma-dev
 pip3 install -r requirements.txt
 
 cp /root/data/3rd_party/albert-torch-fp32.json ./
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 python3 torch2onnx.py --model_path /root/data/checkpoints/open_albert/albert-base-squad.pt --output_path albert-torch-fp32.onnx
 
 onnxsim albert-torch-fp32.onnx albert-torch-fp32-sim.onnx
diff --git a/models/nlp/plm/deberta/ixrt/README.md b/models/nlp/plm/deberta/ixrt/README.md
index b683a4cbde82ad5c7fa2d7964824ba2f6489afee..87496848406c894ec31d9886f4bbc6c6123980c1 100644
--- a/models/nlp/plm/deberta/ixrt/README.md
+++ b/models/nlp/plm/deberta/ixrt/README.md
@@ -15,8 +15,6 @@ fine-tuning to better suit specific downstream tasks, thereby improving the mode
 | :----: | :----: | :----: |
 | MR-V100 | 4.2.0 | 25.03 |
 
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
 ## Model Preparation
 
 ### Prepare Resources
@@ -55,6 +53,7 @@ python3 remove_clip_and_cast.py
 
 ```bash
 git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 
 export ORIGIN_ONNX_NAME=./deberta-sim-drop-clip-drop-invaild-cast
 export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/nlp/plm/deberta/ixrt/ci/prepare.sh b/models/nlp/plm/deberta/ixrt/ci/prepare.sh
index d440393e7ed913ae6a92fc0ab043a5744086f8c1..23ecd2b5bc02b6076db66490f28ab18efe07b86f 100644
--- a/models/nlp/plm/deberta/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/deberta/ixrt/ci/prepare.sh
@@ -21,6 +21,8 @@ apt install -y libnuma-dev
 pip install -r requirements.txt
 
 cp /root/data/3rd_party/deberta-torch-fp32.json ./
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 python3 torch2onnx.py --model_path /root/data/checkpoints/open_deberta/deberta-base-squad.pt --output_path deberta-torch-fp32.onnx
 
 onnxsim deberta-torch-fp32.onnx deberta-torch-fp32-sim.onnx
diff --git a/models/nlp/plm/roberta/ixrt/README.md b/models/nlp/plm/roberta/ixrt/README.md
index acd1b45869ad0103681a7e65488071e52494576f..92cc8e4eb8dfbb8e3490eab6aabaf38134c731b9 100644
--- a/models/nlp/plm/roberta/ixrt/README.md
+++ b/models/nlp/plm/roberta/ixrt/README.md
@@ -17,8 +17,6 @@ our models and code.
| :----: | :----: | :----: | | MR-V100 | 4.2.0 | 25.03 | -**This model is compatible with IXUCA SDK up to version 4.2.0.** - ## Model Preparation ### Prepare Resources @@ -62,6 +60,7 @@ onnxsim open_roberta/roberta-torch-fp32.onnx open_roberta/roberta-torch-fp32_sim ```bash git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ export ORIGIN_ONNX_NAME=./open_roberta/roberta-torch-fp32_sim export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py diff --git a/models/nlp/plm/roberta/ixrt/ci/prepare.sh b/models/nlp/plm/roberta/ixrt/ci/prepare.sh index 81d02ab0621e5c06580fe8469fc9c2012ca3c3ee..5f00f9e9ac7096d7d17d9c1a50cd416c6db432de 100644 --- a/models/nlp/plm/roberta/ixrt/ci/prepare.sh +++ b/models/nlp/plm/roberta/ixrt/ci/prepare.sh @@ -19,6 +19,8 @@ set -x apt install -y libnuma-dev pip install -r requirements.txt +git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ mkdir -p data cp -r /root/data/checkpoints/open_roberta data/ diff --git a/models/nlp/plm/roformer/ixrt/README.md b/models/nlp/plm/roformer/ixrt/README.md index 890158fa2f42669484032186d1333b1187a9a860..5d37b5e6eb6ac8d7c0ce107ee5b248e64ba96a11 100644 --- a/models/nlp/plm/roformer/ixrt/README.md +++ b/models/nlp/plm/roformer/ixrt/README.md @@ -19,8 +19,6 @@ datasets. | :----: | :----: | :----: | | MR-V100 | 4.2.0 | 25.03 | -**This model is compatible with IXUCA SDK up to version 4.2.0.** - ## Model Preparation ### Prepare Resources @@ -68,6 +66,7 @@ python3 deploy.py --model_path ./data/open_roformer/roformer-frozen.onnx --outpu ```bash git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ export ORIGIN_ONNX_NAME=./data/open_roformer/roformer-frozen export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py diff --git a/models/nlp/plm/roformer/ixrt/ci/prepare.sh b/models/nlp/plm/roformer/ixrt/ci/prepare.sh index ea80462db022331cb8b9c20f12a15e9ef8b0bdd6..deda09efeb451ceafa37daf0b0f519e209e9249f 100644 --- a/models/nlp/plm/roformer/ixrt/ci/prepare.sh +++ b/models/nlp/plm/roformer/ixrt/ci/prepare.sh @@ -19,6 +19,8 @@ set -x apt install -y libnuma-dev pip install -r requirements.txt +git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ mkdir -p data cp -r /root/data/checkpoints/open_roformer data/ diff --git a/models/nlp/plm/roformer/ixrt/export_onnx.py b/models/nlp/plm/roformer/ixrt/export_onnx.py index 475dddd7c2ab27b6ca342be98ea92d2c791ff60b..a0213bb449c7d632fdda2b43279037d6883f3424 100644 --- a/models/nlp/plm/roformer/ixrt/export_onnx.py +++ b/models/nlp/plm/roformer/ixrt/export_onnx.py @@ -16,7 +16,7 @@ import tf2onnx from tf2onnx import tf_loader import argparse -ONNX_OPSET = 11 +ONNX_OPSET = 13 def _convert_graphdef_to_onnx(graph_def, inputs=None, diff --git a/models/nlp/plm/videobert/ixrt/README.md b/models/nlp/plm/videobert/ixrt/README.md index 2f47a69bf90d4bc9d3e04fc17a457e260a6530c4..ded0114471da00dded55f6910c833998411cba4c 100644 --- a/models/nlp/plm/videobert/ixrt/README.md +++ b/models/nlp/plm/videobert/ixrt/README.md @@ 
-12,8 +12,6 @@ and textual information into a unified framework. | :----: | :----: | :----: | | MR-V100 | 4.2.0 | 25.03 | -**This model is compatible with IXUCA SDK up to version 4.2.0.** - ## Model Preparation ### Prepare Resources @@ -43,6 +41,7 @@ pip3 install -r requirements.txt ```bash git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ export ORIGIN_ONNX_NAME=./general_perf/model_zoo/popular/open_videobert/video-bert export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py diff --git a/models/nlp/plm/videobert/ixrt/ci/prepare.sh b/models/nlp/plm/videobert/ixrt/ci/prepare.sh index 0d46c6c023fc58658a230714d3a1b06cc9430c2b..7d5f8fa49779ce6d6b52d088cca2ad0ce4a9dd5a 100644 --- a/models/nlp/plm/videobert/ixrt/ci/prepare.sh +++ b/models/nlp/plm/videobert/ixrt/ci/prepare.sh @@ -19,6 +19,8 @@ set -x apt install -y libnuma-dev pip install -r requirements.txt +git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ mkdir -p data cp -r /root/data/checkpoints/open_videobert data/ diff --git a/tests/model_info.json b/tests/model_info.json index e230305fff2fbed1fb15d7f585ed35b9ac71d43d..58adf01b0326d3a63a27d3bf3b1d31ebde2b15d3 100644 --- a/tests/model_info.json +++ b/tests/model_info.json @@ -3003,8 +3003,8 @@ "release_version": "25.03", "release_sdk": "CoreX 4.2.0", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "cv/classification", "toolbox": "", "mdims": "", @@ -5827,8 +5827,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", @@ -6025,8 +6025,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", @@ -6058,8 +6058,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", @@ -6091,8 +6091,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", @@ -6124,8 +6124,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py index eb25acab7388ad14c509fd48a0862ff0bbec7f32..df6f59e122c6e529d05b1ff7fc20f6ea46fd35e6 100644 --- a/tests/run_ixrt.py +++ b/tests/run_ixrt.py @@ -189,7 +189,7 @@ def run_clf_testcase(model): script = f""" cd ../{model['model_path']} export ORIGIN_ONNX_NAME=./swin-large-torch-fp32 - export OPTIMIER_FILE=/root/data/3rd_party/iluvatar-corex-ixrt/tools/optimizer/optimizer.py + export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py export PROJ_PATH=./ bash 
scripts/infer_swinl_fp16_performance.sh cd ./ByteMLPerf/byte_infer_perf/general_perf @@ -450,7 +450,7 @@ def run_nlp_testcase(model): set -x cd ../{model['model_path']} export ORIGIN_ONNX_NAME=./data/open_{model_name}/{model_name} - export OPTIMIER_FILE=/root/data/3rd_party/iluvatar-corex-ixrt/tools/optimizer/optimizer.py + export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py export PROJ_PATH=./ bash scripts/infer_{model_name}_{prec}_performance.sh cd ./ByteMLPerf/byte_infer_perf/general_perf diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/README.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/README.md deleted file mode 100755 index 3d1318032a7b03971285a05b997d3275c0d3c3cf..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/README.md +++ /dev/null @@ -1,114 +0,0 @@ -# IxRT optimizer - -## 1. optimizer 简介 - -`optimizer` 是一个 ixrt 中集成的图融合工具,用于将onnx图中的op融合成对应的IxRT plugin,一般与 IxRT 配合进行使用; - -## 2. optimizer 功能说明 - -| 功能 | 说明 | -| ---------- | ------------------------------------------------------------ | -| 动态图支持 | 支持融合动态图和静态图 | -| 模型支持 | 目前测试通过videobert, roberta, deberta, swinL, roformer, albert, yolov5s, visionTransformer, gpt2模型,其他模型暂不推荐使用该工具 | - -## 3. optimizer 运行参数 - -| 参数 | 说明 | -| ---------------- | ------------------------------------------------------------ | -| `--onnx` | 必选 ,指定要运行的 onnx 模型路径 | -| `--num_heads` | 可选 ,指定模型对应Attention模块注意力头的个数 | -| `--hidden_size` | 可选, 模型模型隐藏层的大小 | -| `--input_shapes` | 可选 ,固定动态模型的输入形状,以从静态形状推理,示例 --input_shapes "input_name1:3x224x224, input_name2:3x224x224"类型 | -| `--dump_onnx` | 可选 ,用于图融合过程中dump出中间的onnx图,生成 _sim 结尾的 onnx 模型 | -| `--model_type` | 可选 ,可以指定要融合的模型类型,默认是"bert", 可选["bert", "swint", "roformer", "yolo", "gpt2", "vit"] | -| `--log_level` | 可选 ,指定IxRT运行时显示日志的等级, 可指定为debug、info、error,默认为 info | - - -## 4. 
运行示例 - -### 4.1 示例1:融合albert|videobert|roberta|deberta - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} -``` - -### 4.2 示例2:融合swinL - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint -``` - -### 4.3 示例3:融合roformer - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --model_type roformer -``` - -### 4.4 示例4:融合yolov5s - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --model_type yolo -``` - -### 4.5 精度验证 - -#### 4.5.1 示例1:albert模型 - -模型变量示例: - -``` -MODEL_PATH="data/albert/albert-base-squad.onnx" -MODEL_END_PATH="data/albert/albert-base-squad_end.onnx" -MODEL_ENGINE_PATH="data/albert/albert-base-squad_end.engine" -``` - -运行命令 - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --dump_onnx -ixrtexec --onnx ${MODEL_END_PATH} --min_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \ - --opt_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \ - --max_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \ - --save_engine ${MODEL_ENGINE_PATH} --log_level verbose --plugins ixrt_plugin -ixrtexec --load_engine ${MODEL_ENGINE_PATH} --ort_onnx ${MODEL_PATH} --plugins ixrt_plugin --verify_acc -``` - -#### 4.5.2 示例2:swinL模型 - -模型变量示例: - -``` -BS=1 -MODEL_PATH="data/swint/swin-transformer-large.onnx" -MODEL_END_PATH = "data/swint/swin-transformer-large_end.onnx" -MODEL_ENGINE_PATH = "data/swint/swin-transformer-large_end.engine" -MODEL_SIM_STATIC_SIM_PATH = "data/swint/swin-transformer-large_sim_static_sim.onnx" -``` - -运行命令 - -```bash -cd oss/tools/optimizer -# 固定输入形状为 ${BS}x3x384x384 -python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint --dump_onnx - -# Build engine -ixrtexec --onnx ${MODEL_END_PATH} --save_engine ${MODEL_ENGINE_PATH} --log_level verbose --plugins ixrt_plugin - -# 测试性能 -ixrtexec --load_engine ${MODEL_ENGINE_PATH} --plugins ixrt_plugin - -# 测试精度 -ixrtexec --load_engine ${MODEL_ENGINE_PATH} --ort_onnx ${MODEL_SIM_STATIC_SIM_PATH} --plugins ixrt_plugin --verify_acc -``` - -请参考[高级话题](5_advanced_topics.md)中的精度对比工具一节,了解详细使用方法和原理。 - -也可以用[C++ API 使用简介](3_cpp_api.md)或 [Python API 使用简介](4_python_api.md) - -具体使用方法可以参考oss/samples diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py deleted file mode 100644 index de522e5b082b122a28b0a0423a40909598aa82d5..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-# - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md deleted file mode 100644 index 65175643c0e50d8445ef65deae088de4600244f0..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md +++ /dev/null @@ -1,44 +0,0 @@ -## CI Test tool for IxRT - -### 1. Install dltest tool - - python setup.py develop - -### 2. Usage - -#### 2.1 Fetch log - -Commmand: - -```shell -ixdltest-fetch args_or_pipe ${log_path} -``` - -Arguments: - -- p or patterns, The pattern of fetch log; -- pn or pattern_names, The name of pattern; -- use_re, Whether use regular expression; -- d or nearest_distance, default=10, The nearest distance of matched pattern; -- start_flag, The flag of start to record log; -- end_flag, The flag of stop to record log; -- split_pattern, The pattern is used to match line, If the line is matched, argument `split_sep` to split the line. -- split_sep, The seperator is used to split line; -- split_idx, The index of split line; -- saved, Save result to path; -- log, Log path. - -Example -Analyse from file -``` -$ ixdltest-fetch run.log -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100 -{'results': [{'Throughput': [188.5461778786721]}]} -- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0 -``` - -Analyse from command line pipe -``` -$ cat run.log | ixdltest-fetch -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100 -{'results': [{'Throughput': [188.5461778786721]}]} -- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0 -``` diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py deleted file mode 100644 index 5458f31666f11de72d52a4e834b8a87be9a992d0..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .utils.infer_args import show_infer_arguments \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py deleted file mode 100644 index 182e895c7fe902a31fc982fab6f96e0c55125c4a..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os -from typing import List, Iterable, Optional - -from dltest.cli.log_parser_cli import LogParserCLI -from dltest.log_parser import LogParser -from dltest.model_compare_config import get_compare_config_with_full_path -from dltest.utils.misc import get_full_path -from dltest.utils.subprocess_tools import get_output -from dltest.model_compare_config import ComparatorConfig - - -FRAMEWORKS = list(ComparatorConfig.get_frameworks()) - -REMAINDER = '...' - -assertion_expr_factory = dict( - eq = "a == b", - ne = "a != b", - ge = "a >= b", - le = "a <= b", - gt = "a > b", - lt = "a < b", -) - - -class AssertCLI(LogParserCLI): - - def command_name(self): - return "assert" - - def predefine_args(self): - super(AssertCLI, self).predefine_args() - self.parser.add_argument('-b', '--assertion_second_value', type=float, default=None, - help='It is used in assertion expression.') - self.parser.add_argument('--print_result', action="store_true", default=False, - help='Whether print result') - self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], - help='The method of capture output') - # FIXME: Using store_action to replase it - self.parser.add_argument('--only_last', type=int, default=0, - help='Whether use the last result to compare') - self.parser.add_argument('--expr', type=str, default="ge", - help=f"Assertion expression, option keys: {', '.join(assertion_expr_factory.keys())}" + - ", or a executable code, such as `a > b`, `a > 1`, ...") - self.parser.add_argument('--use_predefined_parser_rules', action="store_true", default=False, - help='Whether use predefined args of parser.') - self.parser.add_argument('--log', type=str, default=None, help="Log path") - self.parser.add_argument("--run_script", default=[], nargs=REMAINDER) - - def parse_args(self, *args, **kwargs): - args = super(AssertCLI, self).parse_args() - args.only_last = args.only_last > 0 - if len(args.run_script) == 0 and args.log is None: - raise ValueError("The one of `--run_script` or `--log` must be given.") - - if args.assertion_second_value is None: - if args.expr is None: - raise ValueError("The one of `--assertion_second_value` or `--expr` must be given.") - - if args.expr in assertion_expr_factory: - raise ValueError( - "The comparison operators depend on the argument `assertion_second_value`." 
- ) - - return args - - def create_parser(self, args): - if args.use_predefined_parser_rules: - script_path = self._get_script_path(args.run_script) - config = get_compare_config_with_full_path(script_path, to_dict=False) - - return LogParser( - patterns=config.patterns, pattern_names=config.pattern_names, - use_re=config.use_re, nearest_distance=config.nearest_distance, - start_line_pattern_flag=config.start_line_pattern_flag, - end_line_pattern_flag=config.end_line_pattern_flag, - split_pattern=config.split_pattern, - split_sep=config.split_sep, - split_idx=config.split_idx - ) - - return LogParser( - patterns=args.patterns, pattern_names=args.pattern_names, - use_re=args.use_re, nearest_distance=args.nearest_distance, - start_line_pattern_flag=args.start_flag, - end_line_pattern_flag=args.end_flag, - split_pattern=args.split_pattern, - split_sep=args.split_sep, - split_idx=args.split_idx - ) - - def run(self): - args = self.parse_args() - parser = self.create_parser(args) - - if args.print_result: - print(args) - - output = self.get_log(args) - parsed_logs = self.parser_log(parser, output, args) - self.check_logs(parsed_logs, args) - - def get_log(self, args): - if len(args.run_script) == 0: - try: - with open(args.log) as f: - return f.readlines() - except: - print(f"ERROR: Read log fail in {args.log}") - exit(1) - else: - return get_output(args.run_script, capture_output_method=args.capture_output) - - def parser_log(self, parser, output, args) -> List[float]: - results = parser.parse(output) - if args.only_last: - results = results[-1:] - - if len(results) == 0: - raise ValueError("The parsed results is empty, please check patterns.") - if isinstance(results[0], dict): - if len(results[0]) == 0: - raise ValueError("The parsed results is empty, please check patterns.") - key = list(results[0].keys())[0] - results = [result[key] for result in results] - - if isinstance(results[0], Iterable): - results = [result[0] for result in results] - - return results - - def check_logs(self, parsed_logs, args): - if args.print_result: - print("Parsed result:", parsed_logs) - - assertion_expr = assertion_expr_factory.get(args.expr, args.expr) - - assert_results = [] - b = args.assertion_second_value - for a in parsed_logs: - assert_results.append(eval(assertion_expr)) - - if args.print_result: - print("The result of assertion expression:", assert_results) - - if any(assert_results): - print("SUCCESS") - exit(0) - print("FAIL") - exit(1) - - def _get_script_path(self, run_script: List[str]): - # Find shell script by current run_script - def _find_real_shell_script(cmd: List[str]): - for i, field in enumerate(cmd): - if field.endswith('.sh') and self._get_framework(field) in FRAMEWORKS: - return field - - real_shell_script = _find_real_shell_script(run_script) - - # Find shell script by parent process - if real_shell_script is None: - ppid = os.getppid() - import psutil - pproc = psutil.Process(ppid) - pproc_cmd = pproc.cmdline() - real_shell_script = _find_real_shell_script(pproc_cmd) - - if real_shell_script is not None: - real_shell_script = self._get_script_abs_path(real_shell_script) - return real_shell_script - - raise RuntimeError("The script is not named correctly, " + \ - "please use a script name ending with the framework, " + \ - f"got `{' '.join(run_script)}`, " + \ - "e.g. 
train_resnet50_torch.sh") - - def _get_framework(self, shell_script: str) -> Optional[str]: - try: - return shell_script.split('.')[-2].split('_')[-1] - except: - return None - - def _get_script_abs_path(self, run_script): - real_run_script = os.path.realpath(run_script) - if os.path.exists(real_run_script): - return real_run_script - - if "MODEL_DIR" in os.environ: - return os.path.join(os.environ["MODEL_DIR"], run_script) - - if "OLDPWD" in os.environ: - real_run_script = os.path.join(os.environ["OLDPWD"], run_script) - if os.path.exists(real_run_script): - return real_run_script - - raise FileNotFoundError("Not found running script path, " + \ - "please set environment variable `MODEL_DIR`, " + \ - "e.g /path/to/deeplearningsamples/executables/resnet.") - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py deleted file mode 100644 index b40f3a72fb949c18104963fb598c58076c65b479..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py +++ /dev/null @@ -1,56 +0,0 @@ -import os - -from .assert_cli import AssertCLI -from ..utils.subprocess_tools import execute_shell - -RUN_MODE_KEY = "RUN_MODE" -RUN_MODE_STRICT = "strict" - - -class CheckCli(AssertCLI): - - def __init__(self, *args, **kwargs): - super(CheckCli, self).__init__(*args, **kwargs) - self.args = None - - def command_name(self): - return "check" - - def predefine_args(self): - self.parser.add_argument("--check_mode", type=str, default="no", - choices=["all", "strict", "nonstrict", "no"], - help="which running mode needs to be checked") - self.parser.add_argument("--nonstrict_mode_args", type=str, default="", - help="the arguments are used with nonstric testing") - super(CheckCli, self).predefine_args() - - def parse_args(self, *args, **kwargs): - if self.args is None: - args = super(CheckCli, self).parse_args(*args, **kwargs) - args.use_predefined_parser_rules = True - args.nonstrict_mode_args = args.nonstrict_mode_args.split(" ") - - if not self.is_strict_testing(): - args.run_script.extend(args.nonstrict_mode_args) - - if args.check_mode == "all": - args.check_mode = self.current_running_mode() - - self.args = args - return self.args - - def run(self): - args = self.parse_args() - if args.check_mode == self.current_running_mode(): - return super(CheckCli, self).run() - else: - res = execute_shell(args.run_script) - exit(res.returncode) - - def current_running_mode(self): - return os.environ.get(RUN_MODE_KEY, RUN_MODE_STRICT) - - def is_strict_testing(self): - return self.current_running_mode() == RUN_MODE_STRICT - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py deleted file mode 100644 index c631f332b6a46c43c7891e4925d011e49741dc5d..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from dltest.cli.assert_cli import AssertCLI -from dltest.cli.log_comparator_cli import LogComparatorCLI -from dltest.cli.model_validator_cli import ModelValidatorCLI -from dltest.cli.fetch_log_cli import FetchLog -from dltest.cli.check_cli import CheckCli - - -#log_comparator_cli = LogComparatorCLI() -#model_validator_cli = ModelValidatorCLI() -fetch_log_cli = FetchLog() -#assert_cli = AssertCLI() -#check_cli = CheckCli() - - -def make_execute_path(): - preffix = "dltest.cli.entry_points" - clis = [] - for cli_var in globals(): - if cli_var.endswith('_cli'): - cmd_name = globals()[cli_var].command_name() - clis.append(f"ixdltest-{cmd_name}={preffix}:{cli_var}") - - return clis - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py deleted file mode 100644 index 41f3c3cac3151b61362b3ff57609df0f64896181..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import json -import sys -from typing import Mapping -from os.path import basename, join, exists, expanduser, dirname - -from dltest.log_parser import LogParser -from dltest.cli.log_parser_cli import LogParserCLI -from dltest.utils.iluvatar import get_iluvatar_card_type, IluvatarGPU - - - - -def parse_target(target): - result = {} - targets = target.split(",") - for i in targets: - item = i.split(":") - assert len(item) == 2 - key, value = item - result[key] = float(value) - return result - - -def load_json(file): - file_path = expanduser(file) - # 检查文件是否存在 - if exists(file_path): - # 加载json文件 - with open(file_path, 'r') as file: - data = json.load(file) - else: - # 创建一个空的json文件 - data = {} - - return data - -def process_results(results): - result = dict() - for i in results["results"]: - for k, v in i.items(): - result[k] = v[0] - return result - -class FetchLog(LogParserCLI): - - def command_name(self): - return "fetch" - - def predefine_args(self): - super(FetchLog, self).predefine_args() - self.parser.add_argument('log', nargs='?', type=str, help="Log path") - self.parser.add_argument('--saved', type=str, default=None, help='Save to path') - self.parser.add_argument('--saved_entry', type=str, default=None, help='Save to path') - self.parser.add_argument('-t_bi150','--target_bi150', type=str, default=-1.) - self.parser.add_argument('-t_mr100','--target_mr100', type=str, default=-1.) - self.parser.add_argument('-t_mr50','--target_mr50', type=str, default=-1.) - - def run(self): - args = self.parse_args() - parser = LogParser( - patterns=args.patterns, pattern_names=args.pattern_names, - use_re=args.use_re, nearest_distance=args.nearest_distance, - start_line_pattern_flag=args.start_flag, - end_line_pattern_flag=args.end_flag, - split_pattern=args.split_pattern, - split_sep=args.split_sep, - split_idx=args.split_idx - ) - - results = parser.parse(args.log) - if not isinstance(results, Mapping): - results = dict(results=results) - results = process_results(results) - print(results) - - if args.saved is not None: - saved = load_json(args.saved) - if not args.saved_entry: - raise Exception("You need to use --saved_entry to specify entry name of the result") - - saved[args.saved_entry] = results - with open(args.saved, 'w') as f: - json.dump(saved, f, indent=4) - self.compare_results(args, results) - - - def compare_results(self, args, results): - card = get_iluvatar_card_type() - if card == IluvatarGPU.UNKNOWN: - print("Not known which card is used, can you use ixsmi in the environment?") - return - user_target = getattr(args, 'target_'+card.name.lower(), "") - user_target = parse_target(user_target) - - is_expected = True - for key, target in user_target.items(): - if key not in results: - continue - if results[key]={target}") - if not is_expected: - sys.exit(1) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py deleted file mode 100644 index cac8a0a684440371ece5067086cd75eed939f482..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import json -from pprint import pprint - -from dltest.cli.log_parser_cli import LogParserCLI -from dltest.log_comparator import compare_logs_with_paths, DEFAULT_NEAREST_MATCH_CHARS - - -class LogComparatorCLI(LogParserCLI): - - def command_name(self): - return "compare" - - def predefine_args(self): - super(LogComparatorCLI, self).predefine_args() - self.parser.add_argument('--log1', type=str, help="First log") - self.parser.add_argument('--log2', type=str, help="Second log") - self.parser.add_argument('--threshold', type=float, default=0.0001, help="Threshold") - self.parser.add_argument('--only_last', type=int, default=1, help='Whether use the last result to compare') - self.parser.add_argument('--saved', type=str, default=None, help='Save to path') - self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result') - self.parser.add_argument('--allow_greater_than', action="store_true", default=False, help='Allow log1 greater than log2') - - def parse_args(self, *args, **kwargs): - args = super(LogComparatorCLI, self).parse_args(*args, **kwargs) - args.only_last = args.only_last >= 1 - - return args - - def run(self): - args = self.parse_args() - satisfied, results = compare_logs_with_paths( - log1=args.log1, log2=args.log2, - threshold=args.threshold, - patterns=args.patterns, pattern_names=args.pattern_names, - use_re=args.use_re, nearest_distance=args.nearest_distance, - start_line_pattern_flag=args.start_flag, - end_line_pattern_flag=args.end_flag, - only_last=args.only_last, - split_pattern=args.split_pattern, - split_sep=args.split_sep, - split_idx=args.split_idx, - allow_greater_than=True - ) - - if args.print_result: - pprint(results) - - if satisfied: - print("SUCCESS") - else: - print("FAIL") - - if args.saved is not None: - with open(args.saved, 'w') as f: - json.dump(results, f) - - - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py deleted file mode 100644 index d2e2dd1be2d305a83a2969b5d4dbfbfeef2d9fd0..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. - -import json -from typing import Mapping - -from dltest.log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS -from dltest.utils.base_cli import BaseCLI - - -class LogParserCLI(BaseCLI): - - def predefine_args(self): - self.parser.add_argument('-p', '--patterns', nargs="*", type=str, default=None, help='Fetched patterns') - self.parser.add_argument('-pn', '--pattern_names', nargs="*", type=str, default=None, help='The name of pattern') - self.parser.add_argument('--use_re', action="store_true", default=False, help='Whether use regular expression') - self.parser.add_argument('-d', '--nearest_distance', type=int, default=DEFAULT_NEAREST_MATCH_CHARS, help='The nearest distance of matched pattern') - self.parser.add_argument('--start_flag', type=str, default=None, help='The flag of start to record log') - self.parser.add_argument('--end_flag', type=str, default=None, help='The flag of stop to record log') - self.parser.add_argument('--split_pattern', type=str, default=None, help='The pattern is used to match line') - self.parser.add_argument('--split_sep', nargs="*", type=str, default=None, help='The seperator is used to split line') - self.parser.add_argument('--split_idx', nargs="*", type=int, default=None, help='The index of split line') - - def parse_args(self, *args, **kwargs): - args = super(LogParserCLI, self).parse_args(*args, **kwargs) - - return args - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py deleted file mode 100644 index 8d0d77d97d8f4f0d4d3528418c886884fa262575..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import json -import os -import os.path as ospath -from pprint import pprint -from typing import List, Union - -from dltest.utils.base_cli import BaseCLI -from dltest.utils.get_env import get_gpu_type -from dltest.utils.misc import get_full_path -from dltest.model_compare_config import get_compare_config_with_full_path -from dltest.log_comparator import compare_logs_with_paths -from dltest.utils.subprocess_tools import get_output - - -REMAINDER = '...' 
- - -class ModelValidatorCLI(BaseCLI): - - def command_name(self): - return "validate" - - def predefine_args(self): - super(ModelValidatorCLI, self).predefine_args() - self.parser.add_argument('-l', '--compare_log', type=str, default=None, help="Compare log") - self.parser.add_argument('--saved', type=str, default=None, help='Save to path') - self.parser.add_argument('--with_exit_code', type=int, default=1, help="Add exit code for the result of compared") - self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result') - self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], help='The method of capture output') - self.parser.add_argument("run_script", nargs=REMAINDER) - - def parse_args(self, *args, **kwargs): - args = super(ModelValidatorCLI, self).parse_args() - if len(args.run_script) == 0: - print("ERROR: Invalid run_script") - exit(1) - - return args - - def run(self): - args = self.parse_args() - output = self._run_script(args.run_script, capture_output_method=args.capture_output) - self.compare_logs( - output, args.compare_log, args.run_script, - args.saved, args.with_exit_code, - args.print_result - ) - - def compare_logs(self, output: List, compare_log: str, - run_script: List[str], saved: str=None, - with_exit_code: int=1, print_result=False): - script_path = self._get_script_path(run_script) - script_path = get_full_path(script_path) - compare_args = get_compare_config_with_full_path(script_path) - - if compare_log is None: - epoch = self._get_epoch(run_script) - script_name = ospath.basename(script_path) - dist_tag = self._get_dist_tag(script_name) - compare_log = self._find_comparable_log(script_path, epoch, dist_tag) - - if not ospath.exists(compare_log): - print(f"ERROR: {compare_log} not exist. 
Or please use argument `l` to locate log.") - exit(1) - - compare_args['log1'] = output - compare_args['log2'] = compare_log - - satisfied, results = compare_logs_with_paths(**compare_args) - - if print_result: - pprint(results) - - if satisfied: - print("SUCCESS") - else: - print("FAIL") - - if saved is not None: - with open(saved, 'w') as f: - json.dump(results, f) - - if with_exit_code: - if satisfied: - exit(0) - else: - exit(1) - - def _run_script(self, command: List, capture_output_method: str='tempfile'): - return get_output(command, capture_output_method=capture_output_method) - - def _get_script_path(self, run_script: List[str]): - for i, field in enumerate(run_script): - if field.endswith('.py') or field.endswith('.sh'): - return field - - raise RuntimeError("Not found the name of script, " + - "only support python or `sh` script, but got {}.".format(run_script)) - - def _find_comparable_log(self, script_path: str, epoch: Union[str, int], dist_tag: str): - gpu_type = get_gpu_type().lower() - - # Get the platform of trained log - if gpu_type == "nv": - gpu_type = 'bi' - else: - gpu_type = 'nv' - - script_path = get_full_path(script_path) - project_dir = self._get_project_dir(script_path) - script_name = ospath.basename(script_path) - - log_path = f"{project_dir}/runing_logs/{gpu_type}/{gpu_type}-{script_name}.epoch_{epoch}{dist_tag}.log" - - return log_path - - - def _get_epoch(self, run_script: List[str]): - for i, field in enumerate(run_script): - if "--epoch" in field: - if "=" in field: - return field.split("=")[1] - else: - return run_script[i + 1] - - return 'default' - - def _get_dist_tag(self, script_name: str): - try: - import torch - num_gpus = torch.cuda.device_count() - except: - num_gpus = os.environ.get("CUDA_VISIBLE_DEVICES", "all") - - if '_dist_' in script_name or '_multigpu_' in script_name: - return f".{num_gpus}card" - return "" - - def _get_project_dir(self, abs_path): - abs_path = ospath.abspath(abs_path) - script_dir = ospath.dirname(abs_path) - executables_dir = ospath.dirname(script_dir) - return ospath.dirname(executables_dir) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py deleted file mode 100644 index 9da2c0cd579a3407b6d743bfd2a4cdbbd28a687c..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -from typing import List, Mapping, Union, Tuple -from .log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS - -LogLines = List[Mapping] -CompareResult = Tuple[bool, Union[List, Mapping]] - - -def _compute_errors(value1: Mapping, value2: Mapping, threshold: Mapping, allow_greater_than=False) -> CompareResult: - if not isinstance(threshold, Mapping): - _thds = dict() - for key in value1.keys(): - _thds[key] = threshold - threshold = _thds - - result = dict() - satisfied = True - for key, _thd in threshold.items(): - v1, v2 = value1[key], value2[key] - origin_value_type = list - if not isinstance(v1, (tuple, list)): - origin_value_type = float - v1 = [v1] - v2 = [v2] - - real_errors = [] - for v1_i, v2_i in zip(v1, v2): - real_error = v1_i - v2_i - real_errors.append(real_error) - if satisfied and abs(real_error) > _thd: - if allow_greater_than and real_error > 0: - continue - satisfied = False - - if origin_value_type is float and len(real_errors) > 0: - real_errors = real_errors[0] - - result[key] = real_errors - - return satisfied, result - - -def compare_logs(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult: - total_lines = len(log1[0]) - real_errors = [] - satisfied = True - for line_idx in range(total_lines): - _satisfied, _error = _compute_errors(log1[line_idx], log2[line_idx], threshold, allow_greater_than=allow_greater_than) - real_errors.append(_error) - if satisfied and not _satisfied: - satisfied = False - - return satisfied, real_errors - - -def compare_logs_by_last_result(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult: - if len(log1) == 0 or len(log2) == 0: - return False, [] - return _compute_errors(log1[-1], log2[-1], threshold, allow_greater_than=allow_greater_than) - - -def compare_logs_with_paths(log1, log2, threshold: Union[float, Mapping], - patterns: List[str], - pattern_names: List[str] = None, - use_re: bool = False, - nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS, - start_line_pattern_flag: str = None, - end_line_pattern_flag: str = None, - only_last: bool=True, - split_pattern: Union[str, List] = None, - split_sep: List = None, - split_idx: List = None, - allow_greater_than: bool = False): - parser = LogParser( - patterns=patterns, pattern_names=pattern_names, - use_re=use_re, nearest_distance=nearest_distance, - start_line_pattern_flag=start_line_pattern_flag, - end_line_pattern_flag=end_line_pattern_flag, - split_pattern=split_pattern, - split_sep=split_sep, - split_idx=split_idx - ) - - log1 = parser.parse(log1) - log2 = parser.parse(log2) - - if only_last: - compare_result = compare_logs_by_last_result(log1, log2, threshold, allow_greater_than=allow_greater_than) - else: - compare_result = compare_logs(log1, log2, threshold, allow_greater_than=allow_greater_than) - - return compare_result[0], dict(log1=log1, log2=log2, errors=compare_result[-1]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py deleted file mode 100644 index 3c690d8f677b3ae470322e29c266e84993a74266..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from typing import List, Optional, Union, Mapping -import re -import sys - - -DEFAULT_NEAREST_MATCH_CHARS = 10 - - -def read_file(file): - with open(file, 'r') as f: - return f.readlines() - -def read_pipe(): - result = [] - for line in sys.stdin: - result.append(line) - return result - -def postprocess_search_result(results: List[str]) -> List[float]: - if len(results) != 0: - results = list(map(float, results)) - return results - - -def extract_nearest_value_by_key_inline(content: str, key: str, - nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]: - pattern = "%s[\s\S]{0,%d}?(\d+(?:\.\d+)?)" % (key, nearest_distance) - return extract_value_by_pattern_inline(content, pattern) - - -def extract_value_by_pattern_inline(content: str, pattern: str) -> List[float]: - results = re.findall(pattern, content) - return postprocess_search_result(results) - - -def extract_value(content: str, pattern: str, - inline=True, use_re=False, - nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]: - if inline: - if use_re: - return extract_value_by_pattern_inline(content, pattern) - else: - return extract_nearest_value_by_key_inline(content, pattern, nearest_distance) - else: - raise NotImplementedError() - - -class LogParser: - - def __init__(self, - patterns: List[str]=None, - pattern_names: List[str]=None, - use_re: bool=False, - nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS, - start_line_pattern_flag: str=None, - end_line_pattern_flag: str=None, - split_pattern: Union[str, List]=None, - split_sep: List[str]=None, - split_idx: List[int]=None): - if patterns is None and split_sep is None: - raise ValueError("The one of argument `patterns` or `split_sep` must be given.") - - if pattern_names is not None: - if isinstance(patterns, (tuple, list)) and patterns is not None and len(patterns) != len(pattern_names): - raise ValueError("The length of `pattern_names` argument not equal to `patterns`.") - if isinstance(split_sep, (tuple, list)) and split_sep is not None and len(split_sep) != len(pattern_names): - raise ValueError("The length of `pattern_names` argument not equal to `split_sep`.") - - if split_sep is not None and (split_idx is None or not isinstance(split_idx, (int, tuple, list))): - raise ValueError("Invalid index to split text, got {}.".format(split_idx)) - - if split_sep is not None and split_pattern is None: - raise ValueError("Invalid pattern to split text, got {}.".format(split_pattern)) - - self.patterns = patterns - self.use_re = use_re - self.nearest_distance = nearest_distance - self.start_line_pattern_flag = start_line_pattern_flag - self.end_line_pattern_flag = end_line_pattern_flag - - if not isinstance(split_sep, (tuple, list)) and split_sep is not None: - split_sep = [split_sep] - - if not isinstance(split_idx, (tuple, list)): - split_idx = [split_idx] - - self.split_sep = split_sep - self.split_idx = split_idx - - if pattern_names is None: - if patterns is None: - pattern_names = split_idx - else: - 
pattern_names = patterns - self.pattern_names = pattern_names - - if not isinstance(split_pattern, (tuple, list)) and split_sep is not None: - split_pattern = [split_pattern] * len(split_sep) - self.split_pattern = split_pattern - - self.start_record = start_line_pattern_flag is None - - def parse(self, path_or_logs: Union[str, List]) -> List[dict]: - """ - : return: [{matric_name: value}, ...] - """ - - - if path_or_logs: - path_or_logs = read_file(path_or_logs) - else: - path_or_logs = read_pipe() - - ret = [] - for line in path_or_logs: - result = self.parse_inline(line) - if len(result) == 0: - continue - ret.append(result) - return ret - - def parse_inline(self, line) -> dict: - if not self.can_record(line): - return {} - - if self.split_sep is None: - return self._parse_inline_by_match(line) - return self._parse_inline_by_split(line) - - def _parse_inline_by_match(self, line: str): - ret = {} - for name, pattern in zip(self.pattern_names, self.patterns): - result = extract_value( - line, pattern, inline=True, use_re=self.use_re, - nearest_distance=self.nearest_distance - ) - if len(result) == 0: - continue - ret[name] = result - return ret - - def _parse_inline_by_split(self, line: str, to_type=float): - ret = {} - for name, sep, idx, pattern in zip(self.pattern_names, - self.split_sep, - self.split_idx, - self.split_pattern): - if not self.can_matched(line, pattern): - continue - if '\t' in sep: - segs = line.strip().split(sep) - else: - segs = line.strip().replace('\t', ' ').split(sep) - segs = list(filter(lambda kv: kv.strip() not in ["", " ", None], segs)) - if len(segs) <= idx: - continue - ret[name] = to_type(segs[idx]) - return ret - - def can_record(self, line: str): - if self.start_line_pattern_flag is None: - self.start_record = True - elif not self.start_record: - self.start_record = self.can_matched(line, self.start_line_pattern_flag) - - if self.start_record: - if self.end_line_pattern_flag is not None and self.can_matched(line, self.end_line_pattern_flag): - self.start_record = False - - return self.start_record - - def can_matched(self, content: str, pattern: str): - result = re.findall(pattern, content) - return len(result) != 0 - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py deleted file mode 100644 index ab7c60d3a6f0758bdac30b12fe82c83dab6cd520..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os.path as ospath - -from typing import NamedTuple, Union, List, Mapping - -from dltest.log_parser import DEFAULT_NEAREST_MATCH_CHARS - - -class LogComparatorArgs(NamedTuple): - threshold: Union[float, Mapping] - patterns: List[str] = None - pattern_names: List[str] = None - use_re: bool = False - nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS - start_line_pattern_flag: str = None - end_line_pattern_flag: str = None - split_pattern: Union[str, List] = None - split_sep: List = None - split_idx: List = None - only_last: bool = True - allow_greater_than: bool = True - - def to_dict(self): - return self._asdict() - - -class ArgsModelsTuple(NamedTuple): - - args: LogComparatorArgs - models: List[str] - - -class BaseConfig: - - def __getitem__(self, item): - return self.__class__.__dict__[item] - - def __getattr__(self, item): - return self.__class__.__dict__[item] - - def __iter__(self): - for attr, value in self.__class__.__dict__.items(): - if isinstance(value, ArgsModelsTuple): - yield attr - - def iter_items(self): - for attr, value in self.__class__.__dict__.items(): - if isinstance(value, ArgsModelsTuple): - yield attr, value - - -class _TFComparatorConfig(BaseConfig): - - cnn_benchmarks = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["Accuracy @ 1 =", "Accuracy @ 5 ="], - pattern_names=["Acc@1", "Acc@5"] - ), - models=["alexnet", "inceptionv3", "resnet50", "resnet101", "vgg16"] - ) - - dist_cnn_becnmarks = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - split_sep=[' ', ' '], - split_idx=[9, 10], - split_pattern="[\s\S]*?images/sec:[\s\S]*?jitter", - pattern_names=['Acc@1', 'Acc@5'] - ), - models=[ - "alexnet_dist", "inceptionv3_dist", "resnet50_dist", "resnet101_dist", "vgg16_dist" - ] - ) - - bert = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["eval_accuracy ="], - pattern_names=["Accuracy"] - ), - models=["bert"] - ) - - ssd = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["acc="], - pattern_names=["Acc@1"] - ), - models=["ssd"] - ) - - yolov3 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.8, - patterns=["mAP"] - ), - models=["yolov3"] - ) - - vnet = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["background_dice", "anterior_dice", "posterior_dice"] - ), - models=["vnet"] - ) - - -class _TorchComparatorConfig(BaseConfig): - classification = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=8.0, patterns=['Acc@1', 'Acc@5'], - start_line_pattern_flag="Start training", - ), - models=[ - 'googlenet', 'inceptionv3', 'mobilenetv3', 'resnet', 'shufflenetv2', - 'vgg', 'resnet50_dali', 'resnext', 'densenet' - ] - ) - - detection = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.03, - patterns=[ - "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] =" - ], - pattern_names=["mAP"], - start_line_pattern_flag="IoU metric: bbox", - end_line_pattern_flag="IoU metric: segm" - ), - models=[ - 'maskrcnn', 'retinanet', 'ssd' - ] - ) - - bert_cola = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=['mcc'] - ), - models=['bert_cola'] - ) - - bert_mrpc = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=['acc'] - ), - models=['bert_mrpc'] - ) - - bert_pretrain_apex = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=['eval_mlm_accaracy'] - ), - models=['bert_pretrain_apex'] - ) - - segmentation = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=8.0, - 
patterns=['mean IoU:'], - pattern_names=['mIoU'] - ), - models=[ - 'deeplabv3', 'fcn' - ] - ) - - t5 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=5.0, - split_pattern="eval_bleu[\s\S]*?=", - split_sep=["="], - split_idx=[1], - pattern_names=['EvalBleu'] - ), - models=['t5'] - ) - - yolov3 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["mAP"] - ), - models=['yolov3'] - ) - - yolov5 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=[ - "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] =" - ], - pattern_names=["mAP"], - ), - models=['yolov5'], - ) - - yolov5s_coco128 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", - split_sep=[" ", " "], - split_idx=[5, 6], - pattern_names=["AP50", "mAP"] - ), - models=['yolov5s_coco128'] - ) - - centernet_resnet18 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", - split_sep=[" ", " "], - split_idx=[5, 6], - pattern_names=["AP50", "mAP"] - ), - models=['centernet_resnet18'] - ) - - fcos_resnet50_fpn = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", - split_sep=[" ", " "], - split_idx=[5, 6], - pattern_names=["AP50", "mAP"] - ), - models=['fcos_resnet50_fpn'] - ) - - ocr_recognition = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.5, patterns=["0_word_acc"], - ), - models=[ - "sar", "satrn" - ] - ) - - - -class ComparatorConfig: - - _configs = dict(tf=_TFComparatorConfig(), torch=_TorchComparatorConfig()) - - @classmethod - def get_frameworks(cls) -> List: - return list(cls._configs.keys()) - - @classmethod - def get(cls, tf_or_torch, name, default=None): - for model_kind, comb in cls._configs[tf_or_torch].iter_items(): - if name in comb.models: - return comb.args - if default is not None: - return default - raise KeyError("Not found config, but got {name} for {fw}".format(name=name, fw=tf_or_torch)) - - @classmethod - def find_config(cls, script_path: str) -> LogComparatorArgs: - tf_or_torch = script_path.split('.')[-2].split('_')[-1] - - # Find by the name of script - script_name = ospath.basename(script_path).rsplit('.', maxsplit=1)[0] - if script_name.startswith('train_'): - script_name = script_name.replace("train_", "", 1) - while script_name not in [None, "", "/", "\\"]: - try: - config = cls.get(tf_or_torch, script_name) - return config - except: - pass - script_name = script_name.rsplit('_', maxsplit=1) - if len(script_name) <= 1: - break - script_name = script_name[0] - - # Find by the name of model's dir - model_dir_name = ospath.basename(ospath.dirname(script_path)) - try: - config = cls.get(tf_or_torch, model_dir_name) - return config - except: - raise RuntimeError("Not found for", script_path) - - -def get_compare_config_with_full_path(script_path: str, to_dict=True): - config = ComparatorConfig.find_config(script_path) - if to_dict: - return config.to_dict() - return config - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py deleted file mode 100644 index 35f7efa99b21179da30ce34f412fa3319ea1ba00..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from argparse import ArgumentParser -from abc import abstractmethod - - -class BaseCLI: - - def __init__(self, parser=None, *args, **kwargs): - if parser is None: - self.parser = ArgumentParser(description=self.description ,*args, **kwargs) - - def __call__(self): - self.run() - - @property - def description(self): - return None - - @abstractmethod - def command_name(self): - pass - - def predefine_args(self): - pass - - def parse_args(self, *args, **kwargs): - self.predefine_args() - return self.parser.parse_args(*args, **kwargs) - - @abstractmethod - def run(self): - pass - - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py deleted file mode 100644 index 97407f37bd9d8a4c5e0a68c760a561ec03a29f95..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-
-import os
-from collections import defaultdict
-import os.path as osp
-import subprocess
-import sys
-
-
-def get_envinfo():
-    import torch
-    env_info = {}
-    env_info['sys.platform'] = sys.platform
-    env_info['Python'] = sys.version.replace('\n', '')
-
-    cuda_available = torch.cuda.is_available()
-    env_info['CUDA available'] = cuda_available
-    if cuda_available:
-        from torch.utils.cpp_extension import CUDA_HOME
-        env_info['CUDA_HOME'] = CUDA_HOME
-        if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
-            try:
-                nvcc = osp.join(CUDA_HOME, 'bin/nvcc')
-                nvcc = subprocess.check_output(
-                    f'"{nvcc}" -V | tail -n1', shell=True)
-                nvcc = nvcc.decode('utf-8').strip()
-            except subprocess.SubprocessError:
-                nvcc = 'Not Available'
-            env_info['NVCC'] = nvcc
-
-        devices = defaultdict(list)
-        for k in range(torch.cuda.device_count()):
-            devices[torch.cuda.get_device_name(k)].append(str(k))
-        for name, devids in devices.items():
-            env_info['GPU ' + ','.join(devids)] = name
-
-    gcc = subprocess.check_output('gcc --version | head -n1', shell=True)
-    gcc = gcc.decode('utf-8').strip()
-    env_info['GCC'] = gcc
-
-    env_info['PyTorch'] = torch.__version__
-
-    return env_info
-
-
-def get_gpu_type():
-    import torch
-    if "DEBUG_GPU_TYPE" in os.environ:
-        return os.environ["DEBUG_GPU_TYPE"]
-
-    if not torch.cuda.is_available():
-        return "BI"
-    dev_name = torch.cuda.get_device_name(0)
-    if 'IX BI' in dev_name or getattr(torch, "corex", False):
-        _type = "BI"
-    else:
-        _type = "NV"
-
-    return _type
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py
deleted file mode 100644
index 7328dd737c2720d544027ad1822d3c2007656a8e..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import sys
-import subprocess
-from enum import Enum
-
-__all__ = ["get_iluvatar_card_type", "IluvatarGPU"]
-
-class IluvatarGPU(Enum):
-    UNKNOWN = -1
-    MR50 = 0
-    MR100 = 1
-    BI150 = 2
-
-card_ixsmi_names = {
-    "BI150": IluvatarGPU.BI150,
-    "BI-V150": IluvatarGPU.BI150,
-    "MR100": IluvatarGPU.MR100,
-    "MR-V100": IluvatarGPU.MR100,
-    "MR50": IluvatarGPU.MR50,
-    "MR-V50": IluvatarGPU.MR50,
-}
-
-def get_iluvatar_card_type():
-    command = 'ixsmi -L | grep "GPU \{1,\}0"'
-    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    if result.returncode == 0:
-        for key, value in card_ixsmi_names.items():
-            if key in result.stdout:
-                return value
-        else:
-            return IluvatarGPU.UNKNOWN
-    else:
-        return IluvatarGPU.UNKNOWN
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py
deleted file mode 100644
index 29760001cab2d9a8cbeecc894e9e3344ad00d2b4..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import os
-
-from typing import Union, List, Dict, Any, Mapping
-from argparse import Namespace, ArgumentParser
-import json
-
-
-def _obj_to_dict(obj) -> Dict:
-    if isinstance(obj, Mapping):
-        return obj
-
-    try:
-        from absl import flags
-        if isinstance(obj, flags.FlagValues):
-            return obj.flag_values_dict()
-    except:
-        pass
-    if isinstance(obj, Namespace):
-        return obj.__dict__
-    elif isinstance(obj, List):
-        new_obj = dict()
-        for _o in obj:
-            _o_dict = _obj_to_dict(_o)
-            new_obj.update(_o_dict)
-        return new_obj
-    elif not isinstance(obj, Dict):
-        if hasattr(obj, "__dict__"):
-            return obj.__dict__
-        try:
-            typename = type(obj).__name__
-        except:
-            typename = str(obj)
-        return {typename: str(obj)}
-
-
-def json_dump_obj(o):
-    if hasattr(o, "__name__"):
-        return o.__name__
-    return str(o)
-
-
-def show_infer_arguments(args: Union[List, Dict, Any]):
-    """ print running arguments
-    Example 1: For ArgumentParser
-    >>> parser = ArgumentParser("Test")
-    >>> parser.add_argument("--arg0", type=str)
-    >>> args = parser.parse_args()
-    >>> show_infer_arguments(args)
-
-    Example 2: For dict
-    >>> args = dict(arg=1)
-    >>> show_infer_arguments(args)
-
-    Example 3: For custom object
-    >>> from collections import namedtuple
-    >>> ArgsType = namedtuple("ArgsType", ["arg"])
-    >>> args = ArgsType(arg=123)
-    >>> show_infer_arguments(args)
-
-    Example 4: For absl
-    >>> from absl import flags
-    >>> flags.DEFINE_string("arg", "123", "test")
-    >>> show_infer_arguments(flags.FLAGS)
-
-    Example 5: For multi args
-    >>> args1 = dict(a=1)
-    >>> args2 = dict(b=2)
-    >>> show_infer_arguments([args1, args2])
-
-    """
-    if not "SHOW_RUNNING_ARGS" in os.environ:
-        return
-
-    if os.environ["SHOW_RUNNING_ARGS"].lower() in ["0", "f", "false"]:
-        return
-
-    if "LOCAL_RANK" in os.environ:
-        if os.environ["LOCAL_RANK"] != "0":
-            return
-    args = _obj_to_dict(args)
-    args = json.dumps(args, default=json_dump_obj)
-    print("[RunningArguments]", args)
-
-
-if __name__ == '__main__':
-    os.environ["SHOW_RUNNING_ARGS"] = "1"
-    show_infer_arguments([dict(a=1), dict(b=1), object()])
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py
deleted file mode 100644
index 457bdb3ee2aab7d98faa5567856e8fa923589e0a..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
- -import copy -import os - - -def get_full_path(fname): - pwd = os.getcwd() - if fname.startswith('/'): - return fname - return os.path.join(pwd, fname) - - -def is_main_proc(rank): - return str(rank) in ["None", "-1", "0"] - - -def main_proc_print(*args, **kwargs): - if "RANK" in os.environ: - if is_main_proc(os.environ["RANK"]): - print(*args, **kwargs) - return - - if "LOCAL_RANK" in os.environ: - if is_main_proc(os.environ["LOCAL_RANK"]): - print(*args, **kwargs) - return - - print(*args, **kwargs) - - -def create_subproc_env(): - env = copy.copy(os.environ) - env["USE_DLTEST"] = "1" - return env \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py deleted file mode 100644 index a9883213f4f44d8253986e91c64f4015c66d6ec4..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os -import os.path as ospath -from pathlib import Path -import tempfile - - -class TemporaryFile: - - def __init__(self, with_open=False, mode='r'): - self.name = None - self.with_open = with_open - self.mode = mode - - self.file = None - - def create(self): - self.name = tempfile.mktemp() - file_path = Path(self.name) - file_path.touch() - - def delete(self): - if self.name is not None and ospath.exists(self.name): - os.unlink(self.name) - - def read(self): - self._check_file_status() - return self.file.read() - - def readlines(self): - self._check_file_status() - return self.file.readlines() - - def _check_file_status(self): - if self.file is None: - raise RuntimeError("File is closed, please reopen it.") - - def __enter__(self): - self.create() - if self.with_open: - self.file = open(self.name, mode=self.mode) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.with_open: - self.file.close() - self.delete() - - - - - - - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py deleted file mode 100644 index 8c5de879b0470d29e208368f1681df8469dcf488..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import subprocess -from typing import Callable, Union, List - -from dltest.utils.real_tempfile import TemporaryFile -from dltest.utils import misc - - -def get_output_with_pipe(command, shell=None, callback: Callable[[list], None]=None, *args, **kwargs): - if shell is None: - shell = True - - if shell and not isinstance(command, str): - command = " ".join(command) - - stream = subprocess.Popen( - command, shell=shell, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - *args, **kwargs - ) - outputs = [] - while 1: - exit_code = stream.poll() - if exit_code is None: - if stream.stdout.readable(): - outputs.append(stream.stdout.readline().decode("utf8").rstrip()) - if callback is not None: - callback(outputs[-1:]) - print(outputs[-1]) - else: - if stream.stdout.readable(): - lines = stream.stdout.readlines() - lines = [line.decode("utf8".rstrip()) for line in lines] - outputs.extend(lines) - if callback is not None: - callback(outputs[-1:]) - print('\n'.join(lines)) - break - - return outputs - - -def get_output_with_tempfile(command, *args, **kwargs): - if not isinstance(command, (list, tuple)): - command = [command] - stdout = None - with TemporaryFile(with_open=True) as file: - command.extend(['|', 'tee', file.name]) - command = " ".join(command) - - res = subprocess.run(command, stdout=stdout, stderr=subprocess.STDOUT, shell=True, *args, **kwargs) - output = file.readlines() - - return output - -def execute_shell(command, *args, **kwargs): - if "env" not in kwargs: - kwargs["env"] = misc.create_subproc_env() - - if not isinstance(command, (list, tuple)): - command = [command] - - command = " ".join(command) - res = subprocess.run(command, - shell=True, *args, **kwargs) - return res - -def get_output(command: List, capture_output_method: str = 'tempfile', *args, **kwargs): - if "env" not in kwargs: - kwargs["env"] = misc.create_subproc_env() - - if capture_output_method == "tempfile": - return get_output_with_tempfile(command, *args, **kwargs) - return get_output_with_pipe(command, *args, **kwargs) \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py deleted file mode 100644 index 2e4fa4eea09fa2cdf51b02619d56fe5fcced869f..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. - -from setuptools import setup, find_packages -from dltest.cli.entry_points import make_execute_path - -setup( - name="dltest", - version="0.1", - description='Iluvatar Corex AI Toolbox', - packages=find_packages(exclude=('examples')), - include_package_data=True, - zip_safe=False, - entry_points = { - 'console_scripts': make_execute_path(), - }, - install_requires=[ - 'psutil' - ] -) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_PVT.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_PVT.py deleted file mode 100644 index 3a9c0ca081a1b44c00b0909c2b69c0e5a00c1e6a..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_PVT.py +++ /dev/null @@ -1,593 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Optional - -import onnx -from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper -from passes.fuse_series_bias_add import FusionSerialBiasAdd -from passes.fusion_albert_attention import FusionAlbertAttention -from passes.fusion_attention import AttentionMask, FusionAttention -from passes.fusion_biasgelu import FusionBiasGelu -from passes.fusion_customfc import ( - FusionCustomFC, - FusionCustomFCActivation, - FusionCustomFCGPT2, -) -from passes.fusion_disentangled_attention import FusionDisentangledAttention -from passes.fusion_embedlayer import FusionEmbedLayerNormalization -from passes.fusion_fastgelu import FusionFastGelu -from passes.fusion_format_roformer import ( - FusionFormatInvalidMask, - FusionRemoveUselessElementwise, -) -from passes.fusion_gelu import FusionGelu -from passes.fusion_gelu_approximation import FusionGeluApproximation -from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast -from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF -from passes.fusion_options import FusionOptions -from passes.fusion_qordered_attention import FusionQOrderedAttention -from passes.fusion_qordered_gelu import FusionQOrderedGelu -from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization -from passes.fusion_qordered_matmul import FusionQOrderedMatMul -from passes.fusion_reshape import FusionReshape -from passes.fusion_shape import FusionShape -from passes.fusion_skiplayernorm import ( - FusionBiasSkipLayerNormalization, - FusionSkipLayerNormalization, -) - -from passes.fusion_utils import FusionUtils - -from passes.fusion_conv_reformat import FusionConvReformat - -from passes.fusion_xsoftmax import FusionXSoftmax -from passes.fusion_PVT_attention import FusionPVTAttention -from passes.onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class PVTOptimizationOptions(FusionOptions): - """This class is deprecated""" - - def __init__(self, model_type): - logger.warning( - f"PVTOptimizationOptions is depreciated. Please use FusionOptions instead." - ) - super().__init__(model_type) - - -class PVTOnnxModel(OnnxModel): - def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): - """Initialize BERT ONNX Model. 
- - Args: - model (ModelProto): the ONNX model - num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). - hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). - """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - self.attention_fusion.apply() - FusionAlbertAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ).apply() - # FusionVideoBertAttention(self).apply() - # FusionVITAttention(self).apply() - # FusionSwinLAttention(self).apply() - # FusionGptAttentionNoPast(self).apply() - FusionPVTAttention(self).apply() - # Only relevant in models with Q-DQ nodes - self.qordered_attention_fusion.apply() - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_gpt2_classify(self): - fusion = FusionCustomFCGPT2(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def fuse_custom_xsoftmax(self): - fusion = FusionXSoftmax(self) - fusion.apply() - - def fuse_disentangled_attention(self): - fusion = FusionDisentangledAttention(self) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, self.hidden_size) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def conv_reformat(self): - fusion = FusionConvReformat(self) - fusion.apply() - - - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). 
- Returns a list of the graph input names based on the filter whether it is casted or not. - """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_swint_opt: - self.fuse_custom_fc() - self.fuse_swinT_serial_bias_add() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - if options.enable_gpt2_classify or options.enable_vit: - self.fuse_custom_fc_gpt2_classify() - - if options.enable_vit: - self.fuse_custom_fc() - - # if (options is None) or options.enable_attention: - # if options is not None: - # self.attention_mask.set_mask_format(options.attention_mask_format) - self.fuse_attention() - - self.conv_reformat() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - self.fuse_custom_fc() - - self.fuse_custom_xsoftmax() - - self.fuse_disentangled_attention() - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.fuse_custom_fc_activation() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. 
- """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py deleted file mode 100644 index 7324603e61bb7a13a57e586827c8fa67a9af4ae2..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py +++ /dev/null @@ -1,627 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Optional - -import onnx -from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper -from passes.fuse_series_bias_add import FusionSerialBiasAdd -from passes.fusion_albert_attention import FusionAlbertAttention -from passes.fusion_attention import AttentionMask, FusionAttention -from passes.fusion_biasgelu import FusionBiasGelu -from passes.fusion_customfc import ( - FusionCustomFC, - FusionCustomFCActivation, - FusionCustomFCGPT2, - FusionTorchvisionVitCustomFC, -) -from passes.fusion_disentangled_attention import FusionDisentangledAttention -from passes.fusion_embedlayer import FusionEmbedLayerNormalization -from passes.fusion_fastgelu import FusionFastGelu -from passes.fusion_format_roformer import ( - FusionFormatInvalidMask, - FusionRemoveUselessElementwise, -) -from passes.fusion_gelu import FusionGelu -from passes.fusion_gelu_approximation import FusionGeluApproximation -from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast -from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF -from passes.fusion_options import FusionOptions -from passes.fusion_qordered_attention import FusionQOrderedAttention -from passes.fusion_qordered_gelu import FusionQOrderedGelu -from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization -from passes.fusion_qordered_matmul import FusionQOrderedMatMul -from passes.fusion_reshape import FusionReshape -from passes.fusion_shape import FusionShape -from passes.fusion_skiplayernorm import ( - FusionBiasSkipLayerNormalization, - FusionSkipLayerNormalization, -) -from passes.fusion_swinl_attention import FusionSwinLAttention -from passes.fusion_utils import FusionUtils -from passes.fusion_videobert_attention import FusionVideoBertAttention -from passes.fusion_vit_attention import FusionVITAttention, FusionTorchvisionVITAttention -from passes.fusion_xsoftmax import FusionXSoftmax -from passes.fuse_inverse_sigmoid import FusionLayerInverseSigmoid -from passes.fuse_l2_normalization import FusionLayerL2Normalization -from passes.fuse_omdet_attention import FusionLayerOmdetAttention -from passes.onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class BertOptimizationOptions(FusionOptions): - """This class is deprecated""" - - def __init__(self, model_type): - logger.warning( - f"BertOptimizationOptions is depreciated. Please use FusionOptions instead." - ) - super().__init__(model_type) - - -class BertOnnxModel(OnnxModel): - def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): - """Initialize BERT ONNX Model. - - Args: - model (ModelProto): the ONNX model - num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). - hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). 
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - self.attention_fusion.apply() - FusionAlbertAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ).apply() - FusionVideoBertAttention(self).apply() - FusionVITAttention(self).apply() - FusionTorchvisionVITAttention(self).apply() - FusionSwinLAttention(self).apply() - FusionGptAttentionNoPast(self).apply() - # Only relevant in models with Q-DQ nodes - self.qordered_attention_fusion.apply() - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_torchvision_vit(self): - fusion = FusionTorchvisionVitCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_gpt2_classify(self): - fusion = FusionCustomFCGPT2(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def fuse_custom_xsoftmax(self): - fusion = FusionXSoftmax(self) - fusion.apply() - - def fuse_disentangled_attention(self): - fusion = FusionDisentangledAttention(self) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, self.hidden_size) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def fuse_omdet_inverse_sigmoid(self): - fusion = FusionLayerInverseSigmoid(self) - fusion.apply() - - def fuse_omdet_attention(self): - fusion = FusionLayerOmdetAttention(self) - fusion.apply() - - def fuse_l2_normalization(self): - fusion = FusionLayerL2Normalization(self) - fusion.apply() - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or 
Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. - """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_swint_opt: - self.fuse_custom_fc() - self.fuse_swinT_serial_bias_add() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - if options.enable_gpt2_classify or options.enable_vit: - self.fuse_custom_fc_gpt2_classify() - - if options.enable_vit: - self.fuse_custom_fc() - - if (options is None) or options.enable_attention: - if options is not None: - self.attention_mask.set_mask_format(options.attention_mask_format) - self.fuse_attention() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - self.fuse_custom_fc() - - if options.enable_omdet: - self.fuse_omdet_attention() - self.fuse_omdet_inverse_sigmoid() - self.fuse_l2_normalization() - - self.fuse_custom_xsoftmax() - - self.fuse_disentangled_attention() - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.fuse_custom_fc_activation() - - if options.enable_vit: - self.fuse_custom_fc_torchvision_vit() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. 
- """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py deleted file mode 100644 index cc59c37bd48f677a7d06f141f45eaa55aef54656..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py +++ /dev/null @@ -1,591 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_albert_attention import FusionAlbertAttention
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_conformer_attention import FusionConformerAttention
-from passes.fusion_conformer_xsoftmax import FusionConformerXSoftmax
-from passes.fusion_customfc import (
-    FusionConformerCustomFCActivation,
-    FusionCustomFC,
-    FusionCustomFCGPT2,
-)
-from passes.fusion_disentangled_attention import FusionDisentangledAttention
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
-    FusionFormatInvalidMask,
-    FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
-    FusionBiasSkipLayerNormalization,
-    FusionSkipLayerNormalization,
-)
-from passes.fusion_splitQKV import FusionSplitQKV
-from passes.fusion_swinl_attention import FusionSwinLAttention
-from passes.fusion_utils import FusionUtils
-from passes.fusion_vit_attention import FusionVITAttention
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class ConformerOptimizationOptions(FusionOptions):
-    """This class is deprecated"""
-
-    def __init__(self, model_type):
-        logger.warning(
-            "ConformerOptimizationOptions is deprecated. Please use FusionOptions instead."
-        )
-        super().__init__(model_type)
-
-
-class conformerOnnxModel(OnnxModel):
-    def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
-        """Initialize Conformer ONNX Model.
-
-        Args:
-            model (ModelProto): the ONNX model
-            num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
-            hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - FusionConformerAttention(self, self.hidden_size, self.num_heads).apply() - # Only relevant in models with Q-DQ nodes - self.qordered_attention_fusion.apply() - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_conformer_activation(self): - fusion = FusionConformerCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_gpt2_classify(self): - fusion = FusionCustomFCGPT2(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def fuse_custom_xsoftmax(self): - fusion = FusionConformerXSoftmax(self) - fusion.apply() - - def fuse_disentangled_attention(self): - fusion = FusionDisentangledAttention(self) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, self.hidden_size) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - def fuse_split_qkv(self): - fusion = FusionSplitQKV(self, self.hidden_size, self.num_heads) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. 
- """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_swint_opt: - self.fuse_custom_fc() - self.fuse_swinT_serial_bias_add() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - if options.enable_gpt2_classify or options.enable_vit: - self.fuse_custom_fc_gpt2_classify() - - if options.enable_vit: - self.fuse_custom_fc() - - self.fuse_custom_fc() - self.fuse_custom_xsoftmax() - - self.fuse_attention() - - self.fuse_split_qkv() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.remove_unused_constant() - self.fuse_custom_fc_conformer_activation() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. 
- """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_cosyvoice.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_cosyvoice.py deleted file mode 100755 index 98cfc6699ab5276f2fd37915a62487a173fb4d12..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_cosyvoice.py +++ /dev/null @@ -1,640 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Optional - -import onnx -from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper -from passes.fuse_series_bias_add import FusionSerialBiasAdd -from passes.fusion_albert_attention import FusionAlbertAttention -from passes.fusion_attention import AttentionMask, FusionAttention -from passes.fusion_biasgelu import FusionBiasGelu -from passes.fusion_customfc import ( - FusionCustomFC, - FusionCustomFCActivation, - FusionCustomFCGPT2, - FusionTorchvisionVitCustomFC, -) -from passes.fusion_disentangled_attention import FusionDisentangledAttention -from passes.fusion_embedlayer import FusionEmbedLayerNormalization -from passes.fusion_fastgelu import FusionFastGelu -from passes.fusion_format_roformer import ( - FusionFormatInvalidMask, - FusionRemoveUselessElementwise, -) -from passes.fusion_gelu import FusionGelu -from passes.fusion_gelu_approximation import FusionGeluApproximation -from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast -from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF -from passes.fusion_options import FusionOptions -from passes.fusion_qordered_attention import FusionQOrderedAttention -from passes.fusion_qordered_gelu import FusionQOrderedGelu -from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization -from passes.fusion_qordered_matmul import FusionQOrderedMatMul -from passes.fusion_reshape import FusionReshape -from passes.fusion_shape import FusionShape -from passes.fusion_skiplayernorm import ( - FusionBiasSkipLayerNormalization, - FusionSkipLayerNormalization, -) -from passes.fusion_swinl_attention import FusionSwinLAttention -from passes.fusion_utils import FusionUtils -from passes.fusion_videobert_attention import FusionVideoBertAttention -from passes.fusion_vit_attention import FusionVITAttention, FusionTorchvisionVITAttention -from passes.fusion_xsoftmax import FusionXSoftmax -from passes.fuse_inverse_sigmoid import FusionLayerInverseSigmoid -from passes.fuse_l2_normalization import FusionLayerL2Normalization -from passes.fuse_omdet_attention import FusionLayerOmdetAttention -from passes.onnx_model import OnnxModel - -from passes.fusion_cosyvoice_splitQKV_update_KVcache import FusionCosyVoiceSplitQKVUpdateKVCache -from passes.fusion_cosyvoice_attention import ( - FusionCosyvoiceAttention -) -from passes.fusion_cosyvoice_splitQKV import FusionSplitQKV - - - -logger = getLogger(__name__) - - - -class cosyvoiceOnnxModel(OnnxModel): - def __init__(self, model: ModelProto, num_heads: int = 16, hidden_size: int = 1024): - """Initialize BERT ONNX Model. - - Args: - model (ModelProto): the ONNX model - num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). - hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). 
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - self.attention_fusion.apply() - FusionAlbertAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ).apply() - FusionVideoBertAttention(self).apply() - FusionVITAttention(self).apply() - FusionTorchvisionVITAttention(self).apply() - FusionSwinLAttention(self).apply() - FusionGptAttentionNoPast(self).apply() - # Only relevant in models with Q-DQ nodes - self.qordered_attention_fusion.apply() - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_torchvision_vit(self): - fusion = FusionTorchvisionVitCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_gpt2_classify(self): - fusion = FusionCustomFCGPT2(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def fuse_custom_xsoftmax(self): - fusion = FusionXSoftmax(self) - fusion.apply() - - def fuse_disentangled_attention(self): - fusion = FusionDisentangledAttention(self) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, self.hidden_size) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def fuse_omdet_inverse_sigmoid(self): - fusion = FusionLayerInverseSigmoid(self) - fusion.apply() - - def fuse_omdet_attention(self): - fusion = FusionLayerOmdetAttention(self) - fusion.apply() - - def fuse_l2_normalization(self): - fusion = FusionLayerL2Normalization(self) - fusion.apply() - - def fuse_splitQKV_update_kv_cache(self): - fusion = FusionCosyVoiceSplitQKVUpdateKVCache(self, self.hidden_size, self.num_heads) - fusion.apply() - - def fuse_cosyvoice_attention(self): - fusion 
= FusionCosyvoiceAttention(self) - fusion.apply() - - def fuse_cosyvoice_split_qkv(self): - fusion = FusionSplitQKV(self, self.hidden_size, self.num_heads) - fusion.apply() - - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. - """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_swint_opt: - self.fuse_custom_fc() - self.fuse_swinT_serial_bias_add() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - if options.enable_gpt2_classify or options.enable_vit: - self.fuse_custom_fc_gpt2_classify() - - if options.enable_vit: - self.fuse_custom_fc() - - if (options is None) or options.enable_attention: - if options is not None: - self.attention_mask.set_mask_format(options.attention_mask_format) - self.fuse_attention() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - self.fuse_custom_fc() - - if options.enable_omdet: - self.fuse_omdet_attention() - self.fuse_omdet_inverse_sigmoid() - self.fuse_l2_normalization() - - self.fuse_splitQKV_update_kv_cache() - self.fuse_cosyvoice_attention() - self.fuse_cosyvoice_split_qkv() - - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.fuse_custom_fc_activation() - - if options.enable_vit: - self.fuse_custom_fc_torchvision_vit() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. 
- """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py deleted file mode 100644 index 7bffb2e7cbec870423cd006d33a617dd1e70d1fb..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py +++ /dev/null @@ -1,555 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Optional - -import onnx -from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper -from passes.fuse_series_bias_add import FusionSerialBiasAdd -from passes.fusion_albert_attention import FusionAlbertAttention -from passes.fusion_attention import AttentionMask, FusionAttention -from passes.fusion_biasgelu import FusionBiasGelu -from passes.fusion_customfc import ( - FusionCustomFC, - FusionCustomFCActivation, - FusionCustomFcRoformer, -) -from passes.fusion_disentangled_attention import FusionDisentangledAttention -from passes.fusion_embedlayer import FusionEmbedLayerNormalization -from passes.fusion_fastgelu import FusionFastGelu -from passes.fusion_format_roformer import ( - FusionFormatInvalidMask, - FusionRemoveUselessElementwise, -) -from passes.fusion_gelu import FusionGelu -from passes.fusion_gelu_approximation import FusionGeluApproximation -from passes.fusion_layernorm import ( - FusionLayerNormalization, - FusionLayerNormalizationKeras, - FusionLayerNormalizationTF, -) -from passes.fusion_options import FusionOptions -from passes.fusion_qordered_attention import FusionQOrderedAttention -from passes.fusion_qordered_gelu import FusionQOrderedGelu -from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization -from passes.fusion_qordered_matmul import FusionQOrderedMatMul -from passes.fusion_reshape import FusionReshape -from passes.fusion_roformer_attention import FusionRoformerCrossAttention -from passes.fusion_rope import FusionRoPE -from passes.fusion_shape import FusionShape -from passes.fusion_skiplayernorm import ( - FusionBiasSkipLayerNormalization, - FusionSkipLayerNormalization, -) -from passes.fusion_swinl_attention import FusionSwinLAttention -from passes.fusion_utils import FusionUtils -from passes.fusion_videobert_attention import FusionVideoBertAttention -from passes.fusion_vit_attention import FusionVITAttention -from passes.fusion_xsoftmax import FusionXSoftmax -from passes.onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class RoformerOnnxModel(OnnxModel): - def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): - """Initialize BERT ONNX Model. - - Args: - model (ModelProto): the ONNX model - num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). - hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). 
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - FusionRoformerCrossAttention(self).apply() - - def fuse_format_roformer(self): - # FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_roformer(self): - fusion = FusionCustomFcRoformer(self) - fusion.apply() - - def fuse_rope(self): - fusion = FusionRoPE(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalizationKeras(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. 
- """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - self.fuse_custom_fc_roformer() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - self.fuse_custom_fc() - - if (options is None) or options.enable_attention: - if options is not None: - self.attention_mask.set_mask_format(options.attention_mask_format) - self.fuse_attention() - - self.fuse_rope() - - self.fuse_shape() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.fuse_custom_fc_activation() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. - """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py deleted file mode 100644 index dac070d24a66812c4b14cfeff5b7c78ff44c6711..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py +++ /dev/null @@ -1,550 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_rms_norm import FusionRMSNorm
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
-    FusionBiasSkipLayerNormalization,
-    FusionSkipLayerNormalization,
-)
-from passes.fusion_splitQKV_update_KVcache import FusionSplitQKVUpdateKVCache
-from passes.fusion_t5_attention import (
-    FusionT5DecoderAttention,
-    FusionT5EncoderAttention,
-)
-from passes.fusion_utils import FusionUtils
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class BertOptimizationOptions(FusionOptions):
-    """This class is deprecated"""
-
-    def __init__(self, model_type):
-        logger.warning(
-            f"BertOptimizationOptions is deprecated. Please use FusionOptions instead."
-        )
-        super().__init__(model_type)
-
-
-class T5OnnxModel(OnnxModel):
-    def __init__(self, model: ModelProto, num_heads=12, hidden_size=768):
-        """Initialize T5 ONNX Model.
-
-        Args:
-            model (ModelProto): the ONNX model
-            num_heads (int, optional): number of attention heads. Defaults to 12 (0 means detect the parameter automatically).
-            hidden_size (int, optional): hidden dimension. Defaults to 768 (0 means detect the parameter automatically).
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_rms_norm(self): - fusion = FusionRMSNorm(self) - fusion.apply() - - def fuse_t5_encoder_attention(self): - fusion = FusionT5EncoderAttention(self) - fusion.apply() - - def fuse_t5_decoder_attention(self): - fusion = FusionT5DecoderAttention(self) - fusion.apply() - # pass - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, hidden_size=768) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - def fuse_splitQKV_update_kv_cache(self): - fusion = FusionSplitQKVUpdateKVCache(self, self.hidden_size, self.num_heads) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. 
- """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - self.fuse_rms_norm() - - self.fuse_t5_encoder_attention() - - self.fuse_t5_decoder_attention() - - self.fuse_splitQKV_update_kv_cache() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. - """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py deleted file mode 100644 index 42b504c42edfc006b5efac0d385001780d296fb2..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-from onnx import ModelProto
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
-    FusionFormatInvalidMask,
-    FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_shape import FusionShape
-from passes.fusion_utils import FusionUtils
-from passes.fusion_yolov5_decoder import FusionYoloV5Decoder
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class YoloOnnxModel(OnnxModel):
-    def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
-        """Initialize YOLO ONNX Model.
-
-        Args:
-            model (ModelProto): the ONNX model
-            num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
-            hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - super().__init__(model) - self.utils = FusionUtils(self) - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, 0) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. - self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.fuse_reshape() - - FusionYoloV5Decoder(self).apply() - self.remove_unused_constant() - logger.info(f"opset version: {self.get_opset_version()}") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md deleted file mode 100644 index dc823d366b327141bd5646e7d3aef153349cea8e..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md +++ /dev/null @@ -1,51 +0,0 @@ -# IxRT optimizer - -## 1. optimizer 简介 -`optimizer` 是一个 ixrt 中集成的图融合工具,用于将onnx图中的op融合成对应的ixrt plugin; - -## 2. optimizer 功能说明 -| 功能 | 说明 | -| -------------- | ---- | -| 多 batchsize 支持 | 支持设置不同 batchsize 进行推理测试 | -| 动态图支持 | 支持融合动态图和静态图 | -| 模型支持 | 目前测试通过videobert, roberta, deberta, swinL, roformer, albert等模型 | - -## 3. optimizer 运行参数 -| 参数 | 说明 | -| -------------- | ---- | -| `--onnx` | 必选 ,指定要运行的 onnx 模型路径 | -| `--num_heads` | 可选 ,指定模型对应Attention模块注意力头的个数 | -|`--hidden_size` | 可选, 模型模型隐藏层的大小| -|`--input_shapes` | 可选 ,指定模型输入数据类型,示例 --input_shapes "input_name1:3x224x224, input_name2:3x224x224"类型 | -| `--dump_onnx` | 可选 ,用于图融合过程中dump出中间的onnx图 | -|`--model_type` | 可选 ,可以指定要融合的模型类型,默认是"bert", 可选["bert", "swint", "roformer"]| -|`--log_level` |可选 ,指定ixrt运行时显示日志的等级, 可指定为debug、info、error,默认为 info| - - -## 4. 
运行示例 - -### 4.1 示例1:融合albert|videobert|roberta|deberta -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} -``` - -### 4.2 示例2:融合swinL -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint -``` - -### 4.3 示例3:融合roformer -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --model_type roformer -``` - -### 4.4 精度验证 - -请参考[高级话题](5_advanced_topics.md)中的精度对比工具一节,了解详细使用方法和原理。 - -也可以用[C++ API 使用简介](3_cpp_api.md)或 [Python API 使用简介](4_python_api.md) - -具体使用方法可以参考oss/samples diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py deleted file mode 100644 index 0f301e3a58e14713c7ebb26342a6fb39ecdca80e..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -import argparse -import logging -import time -from typing import Dict, Optional - -import onnx -from onnx import ModelProto, helper, load_model -from onnx_model_bert import BertOnnxModel -from onnx_model_roformer import RoformerOnnxModel -from onnx_model_conformer import conformerOnnxModel -from onnx_model_t5 import T5OnnxModel -from onnx_model_yolo import YoloOnnxModel -from onnx_model_PVT import PVTOnnxModel -from onnx_model_cosyvoice import cosyvoiceOnnxModel - - -from onnxsim import simplify -from passes.fusion_options import FusionOptions -from passes.symbolic_shape_infer import SymbolicShapeInference - -logger = logging.getLogger(__name__) -MODEL_TYPES = { - "bert": (BertOnnxModel, None, "pytorch", 1), - "swint": (BertOnnxModel, None, "pytorch", 1), - "roformer": (RoformerOnnxModel, None, "tf2onnx", 1), - "gpt2": (BertOnnxModel, None, "pytorch", 1), - "t5": (T5OnnxModel, None, "tf2onnx", 1), - "yolo": (YoloOnnxModel, None, "pytorch", 1), - "vit": (BertOnnxModel, None, "pytorch", 1), - "conformer": (conformerOnnxModel, None, "pytorch", 1), - "PVT": (PVTOnnxModel, None, "pytorch", 1), - "omdet": (BertOnnxModel, None, "pytorch", 1), - "cosyvoice": (cosyvoiceOnnxModel, None, "pytorch", 1) - -} - - -def optimize_by_fusion( - model: ModelProto, - model_type: str = "bert", - num_heads: int = 0, - hidden_size: int = 0, - optimization_options: Optional[FusionOptions] = None, -): - """Optimize Model by graph fusion logic. - - Note that ONNXRuntime graph optimizations (like constant folding) will not be applied. So it is better to enable - constant folding during exporting ONNX model, or run optimize_by_onnxruntime on the model first like optimize_model. - - For BERT model, num_heads and hidden_size are optional. For other model types, you need specify these parameters. 
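An illustrative invocation for such a non-`bert` case is shown below; the head count and hidden size are placeholders only (they must match the exported model, and are not values taken from this repository):

```bash
# Illustrative only: --num_heads/--hidden_size are placeholders for the real model dimensions.
python3 optimizer.py --onnx ${MODEL_PATH} --model_type t5 --num_heads 8 --hidden_size 512
```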
- - Args: - model (ModelProto): model object - model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'. - num_heads (int, optional): number of attention heads. Defaults to 0. - 0 allows detect the parameter from graph automatically (for model_type "bert" only). - hidden_size (int, optional): hidden size. Defaults to 0. - 0 allows detect the parameter from graph automatically (for model_type "bert" only). - optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions. Defaults to None. - - Returns: - object of an optimizer class. - """ - if model_type != "bert" and (num_heads == 0 or hidden_size == 0): - logger.warning( - "Please specify parameters of num_heads and hidden_size when model_type is not 'bert'" - ) - - (optimizer_class, transformer_class, producer, _) = MODEL_TYPES[model_type] - - if model.producer_name and producer != model.producer_name: - logger.warning( - f'Model producer not matched: Expected "{producer}", Got "{model.producer_name}".' - "Please specify correct --model_type parameter." - ) - - if optimization_options is None: - optimization_options = FusionOptions(model_type) - - optimizer = optimizer_class(model, num_heads, hidden_size) - - optimizer.optimize(optimization_options) - - optimizer.topological_sort() - - return optimizer, transformer_class - - -def optimize_to_ixrt(args): - onnx_name = args.onnx[:-5] - model = onnx.load(args.onnx) - if not args.not_sim: - logger.info("simplify..") - simplified_model, check = simplify(model) - logger.info("simplify model end...") - if args.dump_onnx: - onnx.save(simplified_model, onnx_name + "_sim.onnx") - - # transfer to static shape and optimize it - static_sim_model = simplified_model - if args.input_shapes: - for input_tensor in simplified_model.graph.input: - if input_tensor.name in args.input_shapes.keys(): - new_shape = args.input_shapes[input_tensor.name] - dim_list = [] - for dim in new_shape: - if isinstance(dim, int): - dim_proto = onnx.TensorShapeProto.Dimension() - dim_proto.dim_value = dim - dim_list.append(dim_proto) - elif isinstance(dim, str): - dim_proto = onnx.TensorShapeProto.Dimension() - dim_proto.dim_param = dim - dim_list.append(dim_proto) - - del input_tensor.type.tensor_type.shape.dim[:] - input_tensor.type.tensor_type.shape.dim.extend(dim_list) - - try: - auto_merge = False - if args.model_type in ["roformer"]: - auto_merge = True - static_model = SymbolicShapeInference.infer_shapes( - simplified_model, 2**31 - 1, auto_merge, False, 3 - ) - static_sim_model, check = simplify(static_model) - if args.dump_onnx: - onnx.save(static_sim_model, onnx_name + "_sim_static_sim.onnx") - except Exception as e: - static_model = static_sim_model = simplified_model - - if args.dump_onnx: - onnx.save(static_model, onnx_name + "_sim_static.onnx") - if args.not_sim: - static_sim_model = model - - logger.info("start fusion..") - opt_model, _ = optimize_by_fusion( - static_sim_model, args.model_type, args.num_heads, args.hidden_size - ) - opt_model.save_model_to_file(onnx_name + "_end.onnx") - logger.info("done..") - - -def parse_params(params_str): - params = {} - for item in params_str.replace(" ", "").split(","): - key, value = item.split(":") - params[key] = [int(x) if x.isdigit() else x for x in value.split("x")] - return params - - -def args_parser(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--onnx", type=str, default=None, required=False, help="ONNX model file path" - ) - parser.add_argument( - "--num_heads", - 
type=int, - default=0, - help="Used in model optimization. The num of the head used in the network", - ) - parser.add_argument( - "--hidden_size", - type=int, - default=0, - help="Used in model optimization. The hidden_size used in the network", - ) - parser.add_argument( - "--input_shapes", - type=parse_params, - help='Static input_shapes to the inference, format is --input_shapes "input_name1:3x224x224, input_name2:3x224x224"', - ) - parser.add_argument( - "--dump_onnx", - action="store_true", - help="Whether to dump onnx", - ) - parser.add_argument( - "--model_type", - type=str, - default="bert", - choices=["bert", "swint", "roformer", "t5", "yolo", "gpt2", "vit", "conformer","PVT","omdet","cosyvoice"], - help="Which kind of model to optimize", - ) - parser.add_argument( - "--log_level", - type=str, - default="info", - choices=["debug", "info", "error"], - help="Which kind of model to optimize", - ) - - parser.add_argument( - "--not_sim", - action="store_true", - default=False, - help="simplify model or not", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = args_parser() - if args.log_level == "info": - logging.basicConfig(level=logging.INFO) - elif args.log_level == "debug": - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.ERROR) - optimize_to_ixrt(args) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py deleted file mode 100644 index de522e5b082b122a28b0a0423a40909598aa82d5..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py deleted file mode 100644 index 96da8751b0200bb8610e3dd5070f26ebc51e97ac..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py +++ /dev/null @@ -1,477 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
-# Modifications: keep_io_types can be list of names; convert initializers if needed to preserve precision; add force_fp16_initializers option.
-
-import itertools
-import logging
-from typing import Dict, List
-
-import numpy as np
-import onnx
-from onnx import helper, numpy_helper
-from onnx import onnx_pb as onnx_proto
-from packaging import version
-
-logger = logging.getLogger(__name__)
-
-
-def _npfloat16_to_int(np_list):
-    """
-    Convert numpy float16 to python int.
-
-    :param np_list: numpy float16 list
-    :return int_list: python int list
-    """
-    return [int(bin(_.view("H"))[2:].zfill(16), 2) for _ in np_list]
-
-
-def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65504.0):
-    """
-    Convert float32 numpy array to float16 without changing sign or finiteness.
-    Positive values less than min_positive_val are mapped to min_positive_val.
-    Positive finite values greater than max_finite_val are mapped to max_finite_val.
-    Similar for negative values. NaN, 0, inf, and -inf are unchanged.
-    """
-
-    def between(a, b, c):
-        return np.logical_and(a < b, b < c)
-
-    np_array = np.where(
-        between(0, np_array, min_positive_val), min_positive_val, np_array
-    )
-    np_array = np.where(
-        between(-min_positive_val, np_array, 0), -min_positive_val, np_array
-    )
-    np_array = np.where(
-        between(max_finite_val, np_array, float("inf")), max_finite_val, np_array
-    )
-    np_array = np.where(
-        between(float("-inf"), np_array, -max_finite_val), -max_finite_val, np_array
-    )
-    return np.float16(np_array)
-
-
-def convert_tensor_float_to_float16(
-    tensor, min_positive_val=5.96e-08, max_finite_val=65504.0
-):
-    """Convert tensor float to float16.
-
-    Args:
-        tensor (TensorProto): the tensor to convert.
-        min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08.
-        max_finite_val (float, optional): maximal finite value. Defaults to 65504.0.
-
-    Raises:
-        ValueError: input type is not TensorProto.
-
-    Returns:
-        TensorProto: the converted tensor.
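A minimal NumPy sketch of the clamping rules documented above (illustrative only; it mirrors the described mapping with plain NumPy rather than importing this module):

```python
import numpy as np

MIN_POSITIVE, MAX_FINITE = 5.96e-08, 65504.0  # float16 subnormal min / finite max

x = np.array([1e-12, 3.0, 1e9, -1e-12, -1e9, 0.0], dtype=np.float32)

# Small nonzero magnitudes are pushed out to +/-MIN_POSITIVE, large finite
# magnitudes are pulled in to +/-MAX_FINITE; zero, NaN and +/-inf pass through.
y = np.where((x > 0) & (x < MIN_POSITIVE), MIN_POSITIVE, x)
y = np.where((x < 0) & (x > -MIN_POSITIVE), -MIN_POSITIVE, y)
y = np.where(np.isfinite(x) & (x > MAX_FINITE), MAX_FINITE, y)
y = np.where(np.isfinite(x) & (x < -MAX_FINITE), -MAX_FINITE, y)

print(y.astype(np.float16))  # no overflow to inf, no underflow to zero
```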
- """ - - if not isinstance(tensor, onnx_proto.TensorProto): - raise ValueError( - "Expected input type is an ONNX TensorProto but got %s" % type(tensor) - ) - - if tensor.data_type == onnx_proto.TensorProto.FLOAT: - tensor.data_type = onnx_proto.TensorProto.FLOAT16 - # convert float_data (float type) to float16 and write to int32_data - if tensor.float_data: - float16_data = convert_np_to_float16( - np.array(tensor.float_data), min_positive_val, max_finite_val - ) - int_list = _npfloat16_to_int(float16_data) - tensor.int32_data[:] = int_list - tensor.float_data[:] = [] - # convert raw_data (bytes type) - if tensor.raw_data: - # convert n.raw_data to float - float32_list = np.frombuffer(tensor.raw_data, dtype="float32") - # convert float to float16 - float16_list = convert_np_to_float16( - float32_list, min_positive_val, max_finite_val - ) - # convert float16 to bytes and write back to raw_data - tensor.raw_data = float16_list.tobytes() - return tensor - - -def make_value_info_from_tensor(tensor): - shape = numpy_helper.to_array(tensor).shape - return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape) - - -DEFAULT_OP_BLOCK_LIST = [ - "ArrayFeatureExtractor", - "Binarizer", - "CastMap", - "CategoryMapper", - "DictVectorizer", - "FeatureVectorizer", - "Imputer", - "LabelEncoder", - "LinearClassifier", - "LinearRegressor", - "Normalizer", - "OneHotEncoder", - "SVMClassifier", - "SVMRegressor", - "Scaler", - "TreeEnsembleClassifier", - "TreeEnsembleRegressor", - "ZipMap", - "NonMaxSuppression", - "TopK", - "RoiAlign", - "Resize", - "Range", - "CumSum", - "Min", - "Max", - "Upsample", -] - - -class InitializerTracker: - """Class for keeping track of initializer.""" - - def __init__(self, initializer: onnx_proto.TensorProto): - self.initializer = initializer - self.fp32_nodes = [] - self.fp16_nodes = [] - - def add_node(self, node: onnx_proto.NodeProto, is_node_blocked): - if is_node_blocked: - self.fp32_nodes.append(node) - else: - self.fp16_nodes.append(node) - - -def convert_float_to_float16( - model, - min_positive_val=5.96e-08, - max_finite_val=65504.0, - keep_io_types=False, - disable_shape_infer=False, - op_block_list=None, - node_block_list=None, - force_fp16_initializers=False, -): - """Convert model tensor float type in the ONNX ModelProto input to tensor float16. - - Args: - model (ModelProto): The ONNX model to convert. - min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08. - max_finite_val (float, optional): maximal finite value of float16. Defaults to 65504. - keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names. - If True, model inputs/outputs should be left as float32. Defaults to False. - disable_shape_infer (bool, optional): Skips running onnx shape/type inference. Useful if shape inference has been done. Defaults to False. - op_block_list (List[str], optional): List of op types to leave as float32. - Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST` as default. - node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None. - force_fp16_initializers(bool): force converting all float initializers to float16. - Default to false, which will convert only the one needed to avoid precision loss. - Raises: - ValueError: input type is not ModelProto. - - Returns: - ModelProto: converted model. - """ - assert ( - min_positive_val >= 5.96e-08 - ), "invalid min_positive_val. 
smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05" - assert max_finite_val <= float( - np.finfo(np.float16).max - ), "invalid max_finite_val. largest float16 value: 65504" - - func_infer_shape = None - if not disable_shape_infer and version.parse(onnx.__version__) >= version.parse( - "1.2.0" - ): - try: - from onnx.shape_inference import infer_shapes - - func_infer_shape = infer_shapes - finally: - pass - - if not isinstance(model, onnx_proto.ModelProto): - raise ValueError( - "Expected model type is an ONNX ModelProto but got %s" % type(model) - ) - - # create blocklists - if op_block_list is None: - op_block_list = DEFAULT_OP_BLOCK_LIST - if node_block_list is None: - node_block_list = [] - op_block_list = set(op_block_list) - node_block_list = set(node_block_list) - - logger.debug( - f"fp16 parameters: min_positive_val={min_positive_val} max_finite_val={max_finite_val} keep_io_types={keep_io_types} disable_shape_infer={disable_shape_infer} op_block_list={op_block_list} node_block_list={node_block_list} force_fp16_initializers={force_fp16_initializers}" - ) - - # create a queue for BFS - queue = [] - value_info_list = [] - node_list = [] - # type inference on input model - if func_infer_shape is not None: - model = func_infer_shape(model) - queue.append(model) - name_mapping = {} - graph_io_to_skip = set() - io_casts = set() - - fp32_inputs = [ - n.name - for n in model.graph.input - if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT - ] - fp32_outputs = [ - n.name - for n in model.graph.output - if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT - ] - if isinstance(keep_io_types, list): - fp32_inputs = [n for n in fp32_inputs if n in keep_io_types] - fp32_outputs = [n for n in fp32_outputs if n in keep_io_types] - elif not keep_io_types: - fp32_inputs = [] - fp32_outputs = [] - - for i, n in enumerate(model.graph.input): - if n.name in fp32_inputs: - output_name = "graph_input_cast_" + str(i) - name_mapping[n.name] = output_name - graph_io_to_skip.add(n.name) - - node_name = "graph_input_cast" + str(i) - new_value_info = model.graph.value_info.add() - new_value_info.CopyFrom(n) - new_value_info.name = output_name - new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 - # add Cast node (from tensor(float) to tensor(float16) after graph input - new_node = [ - helper.make_node("Cast", [n.name], [output_name], to=10, name=node_name) - ] - model.graph.node.extend(new_node) - value_info_list.append(new_value_info) - io_casts.add(node_name) - - for i, n in enumerate(model.graph.output): - if n.name in fp32_outputs: - input_name = "graph_output_cast_" + str(i) - name_mapping[n.name] = input_name - graph_io_to_skip.add(n.name) - - node_name = "graph_output_cast" + str(i) - # add Cast node (from tensor(float16) to tensor(float) before graph output - new_value_info = model.graph.value_info.add() - new_value_info.CopyFrom(n) - new_value_info.name = input_name - new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 - new_node = [ - helper.make_node("Cast", [input_name], [n.name], to=1, name=node_name) - ] - model.graph.node.extend(new_node) - value_info_list.append(new_value_info) - io_casts.add(node_name) - - fp32_initializers: Dict[str, InitializerTracker] = {} - while queue: - next_level = [] - for q in queue: - # if q is model, push q.graph (GraphProto) - if isinstance(q, onnx_proto.ModelProto): - next_level.append(q.graph) - # if q is model.graph, push q.node.attribute (AttributeProto) - if 
isinstance(q, onnx_proto.GraphProto): - for n in q.initializer: # TensorProto type - if n.data_type == onnx_proto.TensorProto.FLOAT: - assert n.name not in fp32_initializers - fp32_initializers[n.name] = InitializerTracker(n) - - for n in q.node: - # if n is in the block list (doesn't support float16), no conversion for the node, - # and save the node for further processing - if n.name in io_casts: - continue - for i in range(len(n.input)): - if n.input[i] in name_mapping: - n.input[i] = name_mapping[n.input[i]] - for i in range(len(n.output)): - if n.output[i] in name_mapping: - n.output[i] = name_mapping[n.output[i]] - - is_node_blocked = ( - n.op_type in op_block_list or n.name in node_block_list - ) - for input in n.input: - if input in fp32_initializers: - fp32_initializers[input].add_node(n, is_node_blocked) - - if is_node_blocked: - node_list.append(n) - else: - if n.op_type == "Cast": - for attr in n.attribute: - if attr.name == "to" and attr.i == 1: - attr.i = 10 - break - for attr in n.attribute: - next_level.append(attr) - # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto) - # and process node.attribute.t and node.attribute.tensors (TensorProto) - if isinstance(q, onnx_proto.AttributeProto): - next_level.append(q.g) - for n in q.graphs: - next_level.append(n) - q.t.CopyFrom( - convert_tensor_float_to_float16( - q.t, min_positive_val, max_finite_val - ) - ) - for n in q.tensors: - n = convert_tensor_float_to_float16( - n, min_positive_val, max_finite_val - ) - # if q is graph, process input, output and value_info (ValueInfoProto) - if isinstance(q, onnx_proto.GraphProto): - # Note that float initializers tracked by fp32_initializers will be processed later. - # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to - # tensor(float16) except map and seq(map). And save them in value_info_list for further processing - for n in itertools.chain(q.input, q.output, q.value_info): - if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: - if n.name not in graph_io_to_skip: - n.type.tensor_type.elem_type = ( - onnx_proto.TensorProto.FLOAT16 - ) - value_info_list.append(n) - if n.type.HasField("sequence_type"): - if ( - n.type.sequence_type.elem_type.tensor_type.elem_type - == onnx_proto.TensorProto.FLOAT - ): - if n.name not in graph_io_to_skip: - n.type.sequence_type.elem_type.tensor_type.elem_type = ( - onnx_proto.TensorProto.FLOAT16 - ) - value_info_list.append(n) - - queue = next_level - - for key, value in fp32_initializers.items(): - # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes. - if force_fp16_initializers or value.fp16_nodes: - value.initializer = convert_tensor_float_to_float16( - value.initializer, min_positive_val, max_finite_val - ) - value_info_list.append(make_value_info_from_tensor(value.initializer)) - if value.fp32_nodes and not force_fp16_initializers: - logger.info( - "initializer is used by both fp32 and fp16 nodes. 
Consider add these nodes to block list:{}".format( - value.fp16_nodes - ) - ) - - # process the nodes in block list that doesn't support tensor(float16) - for node in node_list: - # if input's name is in the value_info_list meaning input is tensor(float16) type, - # insert a float16 to float Cast node before the node, - # change current node's input name and create new value_info for the new name - for i in range(len(node.input)): - input = node.input[i] - for value_info in value_info_list: - if input == value_info.name: - # create new value_info for current node's new input name - new_value_info = model.graph.value_info.add() - new_value_info.CopyFrom(value_info) - output_name = node.name + "_input_cast_" + str(i) - new_value_info.name = output_name - new_value_info.type.tensor_type.elem_type = ( - onnx_proto.TensorProto.FLOAT - ) - # add Cast node (from tensor(float16) to tensor(float) before current node - node_name = node.name + "_input_cast" + str(i) - new_node = [ - helper.make_node( - "Cast", [input], [output_name], to=1, name=node_name - ) - ] - model.graph.node.extend(new_node) - # change current node's input name - node.input[i] = output_name - break - # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a float to - # float16 Cast node after the node, change current node's output name and create new value_info for the new name - for i in range(len(node.output)): - output = node.output[i] - for value_info in value_info_list: - if output == value_info.name: - # create new value_info for current node's new output - new_value_info = model.graph.value_info.add() - new_value_info.CopyFrom(value_info) - input_name = node.name + "_output_cast_" + str(i) - new_value_info.name = input_name - new_value_info.type.tensor_type.elem_type = ( - onnx_proto.TensorProto.FLOAT - ) - # add Cast node (from tensor(float) to tensor(float16) after current node - node_name = node.name + "_output_cast" + str(i) - new_node = [ - helper.make_node( - "Cast", [input_name], [output], to=10, name=node_name - ) - ] - model.graph.node.extend(new_node) - # change current node's input name - node.output[i] = input_name - break - return model - - -def float_to_float16_max_diff( - tensor, min_positive_val=5.96e-08, max_finite_val=65504.0 -): - """Measure the maximum absolute difference after converting a float tensor to float16.""" - if not isinstance(tensor, onnx_proto.TensorProto): - raise ValueError( - "Expected input type is an ONNX TensorProto but got %s" % type(tensor) - ) - if tensor.data_type != onnx_proto.TensorProto.FLOAT: - raise ValueError("Expected tensor data type is float.") - - float32_data = None - if tensor.float_data: - float32_data = np.array(tensor.float_data) - - if tensor.raw_data: - float32_data = np.frombuffer(tensor.raw_data, dtype="float32") - - if float32_data is None: - raise RuntimeError("external data not loaded!") - - float16_data = convert_np_to_float16(float32_data, min_positive_val, max_finite_val) - return np.amax(np.abs(float32_data - np.float32(float16_data))) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_inverse_sigmoid.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_inverse_sigmoid.py deleted file mode 100644 index 9862d9ee4bee8da619750b2544ddc48d35be0fa9..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_inverse_sigmoid.py +++ /dev/null @@ -1,85 +0,0 @@ - 
-from logging import getLogger -from typing import Dict - -import numpy as np -from onnx import TensorProto, helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - -class FusionLayerInverseSigmoid(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, "InverseSigmoid", "Clip" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - +------------Clip-----------+ - | | - | v - [Root] --> Clip--> Sub --> Clip --> Div --> Log - """ - children = self.model.get_children(node, input_name_to_nodes) - if len(children) != 2: - return - - root_input = node.input[0] - - if not ((children[0].op_type == "Sub" and children[1].op_type == "Clip") or (children[0].op_type == "Clip" and children[1].op_type == "Sub")): - return - - log_node = None - for child in children: - log_node = self.model.find_first_child_by_type( - child, "Log", input_name_to_nodes, recursive=True - ) - if log_node is not None: - break - if log_node is None: - return - parent_nodes = self.model.match_parent_path( - log_node, - ["Div", "Clip", "Sub", "Clip"], - [0, 1, 0, 1], - output_name_to_node, - ) - if parent_nodes is None: - return - - sub_node = parent_nodes[2] - if sub_node not in children: - return - - div_node = parent_nodes[0] - div_parents_nodes = self.model.get_parents(div_node) - if len(div_parents_nodes) != 2: - return - if div_parents_nodes[0].op_type != "Clip": - return - if div_parents_nodes[0] not in children: - return - - subgraph_nodes = [node] - subgraph_nodes.extend([log_node]) - subgraph_nodes.extend(parent_nodes) - subgraph_nodes.extend([div_parents_nodes[0]]) - _, eps_val = self.model.get_constant_input(div_parents_nodes[0]) - - self.nodes_to_remove.extend(subgraph_nodes) - inverse_sigmoid_node = helper.make_node( - "InverseSigmoid", - inputs=[node.input[0]], - outputs=[log_node.output[0]], - name=self.model.create_node_name( - "InverseSigmoid", name_prefix="InverseSigmoid" - ), - ) - inverse_sigmoid_node.attribute.extend( - [helper.make_attribute("epsilon", float(eps_val))] - ) - self.nodes_to_add.append(inverse_sigmoid_node) - self.node_name_to_graph_name[inverse_sigmoid_node.name] = self.this_graph_name \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_l2_normalization.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_l2_normalization.py deleted file mode 100644 index bfd1ed28eb8b0f3d7c65b1e31da8c1dc45415ce7..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_l2_normalization.py +++ /dev/null @@ -1,69 +0,0 @@ -from logging import getLogger -from typing import Dict - -import numpy as np -from onnx import TensorProto, helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - -class FusionLayerL2Normalization(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, "L2Normalization", "Abs" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - +-------------------------------------------------------+ - | | - | v - [Root] --> Abs--> Pow --> ReduceSum --> Pow --> Clip --> Div - """ - pow1_nodes = self.model.get_children(node, input_name_to_nodes) - if len(pow1_nodes) != 1 or pow1_nodes[0].op_type != "Pow": - return - - reduce_nodes = self.model.get_children(pow1_nodes[0], 
input_name_to_nodes) - if len(reduce_nodes) != 1 or reduce_nodes[0].op_type != "ReduceSum": - return - - pow2_nodes = self.model.get_children(reduce_nodes[0], input_name_to_nodes) - if len(pow2_nodes) != 1 or pow2_nodes[0].op_type != "Pow": - return - - clip_nodes = self.model.get_children(pow2_nodes[0], input_name_to_nodes) - if len(clip_nodes) != 1 or clip_nodes[0].op_type != "Clip": - return - - div_nodes = self.model.get_children(clip_nodes[0], input_name_to_nodes) - if len(div_nodes) != 1 or div_nodes[0].op_type != "Div": - return - - root_input = node.input[0] - if div_nodes[0].input[0] != root_input: - return - - subgraph_nodes = [node, pow1_nodes[0], reduce_nodes[0], pow2_nodes[0], clip_nodes[0], div_nodes[0]] - _, eps_val = self.model.get_constant_input(clip_nodes[0]) - _, norm_axes = self.model.get_constant_input(reduce_nodes[0]) - norm_axes = norm_axes.astype(np.int32) - - self.nodes_to_remove.extend(subgraph_nodes) - l2_normalization_node = helper.make_node( - "L2Normalization", - inputs=[node.input[0]], - outputs=[div_nodes[0].output[0]], - name=self.model.create_node_name( - "L2Normalization", name_prefix="L2Normalization" - ), - ) - l2_normalization_node.attribute.extend( - [helper.make_attribute("epsilon", float(eps_val)), - helper.make_attribute("axes", norm_axes), - helper.make_attribute("axes_length", int(norm_axes.size))] - ) - self.nodes_to_add.append(l2_normalization_node) - self.node_name_to_graph_name[l2_normalization_node.name] = self.this_graph_name \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_omdet_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_omdet_attention.py deleted file mode 100644 index 3451731f835ef05d8e61e0b5da2ef724be808f17..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_omdet_attention.py +++ /dev/null @@ -1,149 +0,0 @@ - -from logging import getLogger -from typing import Dict - -import math -import numpy as np -from onnx import TensorProto, helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - -class FusionLayerOmdetAttention(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomQKVToContextPluginDynamic_IxRT", "CustomFCPluginDynamic_IxRT" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - [Root] --> CustomFCPluginDynamic_IxRT--> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT - """ - children = self.model.get_children(node, input_name_to_nodes) - parent = self.model.get_parents(node, output_name_to_node) - - if len(children) != 1: - return - if len(parent) != 1: - return - - fc_first_node = None - for par in parent: - fc_first_node = self.model.find_first_parent_by_type( - par, "CustomFCPluginDynamic_IxRT", output_name_to_node, recursive=True - ) - if fc_first_node is not None: - break - if fc_first_node is None: - return - - start_node = node - - # v path - v_nodes = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 1, 0, 0], - output_name_to_node, - ) - - # path1, q and k path - q_nodes = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Softmax", "Add", "MatMul", "Transpose", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 0, 0, 0, 1, 0, 0, 0], - output_name_to_node, - ) 
- - k_nodes = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Softmax", "Add", "MatMul", "Mul", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - # path2, q and k path - q_nodes_1 = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Softmax", "Reshape", "Add", "Reshape", "Add", "MatMul", "Transpose", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], - output_name_to_node, - ) - - k_nodes_1 = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Softmax", "Reshape", "Add", "Reshape", "Add", "MatMul", "Mul", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - if v_nodes is None: - return - - if v_nodes and q_nodes and k_nodes: - subgraph_nodes = [] - subgraph_nodes.extend(q_nodes) - subgraph_nodes.extend(k_nodes) - subgraph_nodes.extend(v_nodes) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - - add_node = q_nodes[4] - hidden_size = start_node.attribute[0].i - _, mul_val = self.model.get_constant_input(k_nodes[6]) - num_heads = hidden_size // math.floor((1/mul_val)*(1/ mul_val)) - attention_input_1_name = add_node.input[1] - - if v_nodes and q_nodes_1 and k_nodes_1: - subgraph_nodes = [] - subgraph_nodes.extend(q_nodes_1) - subgraph_nodes.extend(k_nodes_1) - subgraph_nodes.extend(v_nodes) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - - hidden_size = start_node.attribute[0].i - _, mul_val = self.model.get_constant_input(k_nodes_1[9]) - num_heads = hidden_size // math.floor((1/mul_val)*(1/ mul_val)) - - add_1 = self.model.get_initializer(q_nodes_1[5].input[1], True) - add_2 = self.model.get_initializer(q_nodes_1[7].input[1], True) - add_all = np.squeeze(add_1 + add_2) - - attention_input_1_name = "attention_" + q_nodes_1[5].input[1] - attention_input_1 = helper.make_tensor( - attention_input_1_name, TensorProto.FLOAT, add_all.shape, add_all.flatten().tolist()) - - self.model.add_initializer(attention_input_1, self.this_graph_name) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=[fc_first_node.output[0], attention_input_1_name], - outputs=[start_node.input[0]], - name=self.model.create_node_name( - "OmdetAttention", name_prefix="OmdetAttention" - ), - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend([helper.make_attribute("hidden_size", hidden_size)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) - - self.nodes_to_remove.extend(subgraph_nodes_unique) - - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - - \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py deleted file mode 100644 index bb9a1cab034aaf714b416ea971ac9e6d69884894..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -from logging import getLogger - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSerialBiasAdd(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "Add", "Softmax") - - def match_parent_path_from_dict(self, start_node, path_dict): - res_path = None - res_nodes = None - for k, v in path_dict.items(): - res_nodes = self.model.match_parent_path(start_node, v[0], v[1]) - if res_nodes is None: - continue - return res_nodes, k - return res_nodes, res_path - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - paths = { - "path1": (["Reshape", "Add", "Reshape", "Add"], [0, 0, 0, 0]), - } - series_nodes, path_chosen = self.match_parent_path_from_dict(node, paths) - if not series_nodes: - return - last_reshape, add_2nd, _, add_1st = series_nodes - - biases = [ - self.model.get_initializer(add_1st.input[1]), - self.model.get_initializer(add_2nd.input[1]), - ] - if not all(biases): - return - - bias_arr_1st = NumpyHelper.to_array(biases[0]) - bias_arr_2nd = NumpyHelper.to_array(biases[1]).squeeze(0) - try: - relative_position_bias = bias_arr_1st + bias_arr_2nd - except Exception as e: - print("Two bias are unrelated:", e) - return - - # Fuse - add_name = self.model.create_node_name("Add", "Add") - B = biases[0] - B.CopyFrom(numpy_helper.from_array(relative_position_bias, B.name)) - - fused_node = helper.make_node( - "Add", - inputs=[add_1st.input[0], B.name], - outputs=last_reshape.output, - name=add_name, - ) - fused_node.domain = "com.iluvatar" - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - self.nodes_to_remove.extend(series_nodes) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_PVT_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_PVT_attention.py deleted file mode 100644 index 2d4cc73a9dcb1c8d31d778b380bd0e8a13f454e9..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_PVT_attention.py +++ /dev/null @@ -1,130 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionPVTAttention(Fusion): - """ - Fuse FusionPVTAttention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQkvCrossToContext_IxRT", - ["Softmax"], - ) - - # Flags to show warning only once - self.num_heads_warning = False - self.hidden_size_warning = False - - - def create_decoder_attention_node( - self, inputs: str, outputs: str, type_mask: int, has_mask: int,scale: float - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - - attention_node_name = self.model.create_node_name("cross_Attention") - attention_node = helper.make_node( - "CustomQkvCrossToContext_IxRT", - inputs=inputs, - outputs=outputs, - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("scale", scale)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("type_mask", type_mask)]) - - return attention_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - """ - path: - - (query) ---------------->MatMul ---->Mul --->softmax --->MatMul---> - / / - (key) ---->Transpose --> / - / - / - / - (value)---------------------------------------------> - - """ - - start_node = node - qkv_paths = { - "path": (["Mul", "MatMul", "Transpose"], [0, 0, 0]), # cross attention qery pass - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - next_nodes = self.model.get_children(node) - if len(next_nodes) == 0: - return - - if next_nodes[0].op_type != "MatMul": - return - - second_matmul_node = next_nodes[0] - attention_outputs = second_matmul_node.output - remove_nodes = [second_matmul_node, node] - - - - (mul_node, first_matmul_node, transpose_node) = qkv_nodes - transpose_nodes = self.model.get_parents(first_matmul_node) - - q_input = transpose_nodes[0].output[0] - k_input = transpose_nodes[1].input[0] - v_input = second_matmul_node.input[1] - attention_inputs = [q_input, k_input, v_input] - remove_nodes.extend([first_matmul_node, mul_node, transpose_nodes[1]]) - - has_mask = 0 - type_mask = 4 - - scale = numpy_helper.to_array(self.model.get_initializer(mul_node.input[1])).item() - atten_node = self.create_decoder_attention_node( - attention_inputs, attention_outputs, type_mask, has_mask,scale - ) - self.nodes_to_add.append(atten_node) - 
self.node_name_to_graph_name[atten_node.name] = self.this_graph_name - self.nodes_to_remove.extend(remove_nodes) \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py deleted file mode 100644 index a3e31fe7dd164b86cf9e6f4e476bc0b31246e747..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py +++ /dev/null @@ -1,643 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import List, Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_attention import AttentionMask -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -def get_tensor_attr(attrs, attr_name): - result = None - for i in attrs: - if i.name == attr_name: - return numpy_helper.to_array(i.t) - return result - - -class FusionAlbertAttention(Fusion): - """ - Fuse Albert subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - hidden_size: int, - num_heads: int, - attention_mask: AttentionMask, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - ) - self.hidden_size = hidden_size - self.num_heads = num_heads - self.attention_mask = attention_mask - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. 
- - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape_value = self.model.get_constant_value(reshape_q.input[1]) - if q_shape_value is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return self.num_heads, self.hidden_size # Fall back to user specified value - - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return self.num_heads, self.hidden_size # Fall back to user specified value - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - if self.num_heads > 0 and num_heads != self.num_heads: - if self.num_heads_warning: - logger.warning( - f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value." - ) - self.num_heads_warning = False # Do not show the warning more than once - - if self.hidden_size > 0 and hidden_size != self.hidden_size: - if self.hidden_size_warning: - logger.warning( - f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." - ) - self.hidden_size_warning = ( - False # Do not show the warning more than once - ) - - return num_heads, hidden_size - - def get_add_qk_str(self, add_qk: NodeProto): - shape_infer = self.model.infer_runtime_shape(update=True) - if shape_infer is None: - return - - input_0_shape = shape_infer.get_edge_shape(add_qk.input[0]) - input_1_shape = shape_infer.get_edge_shape(add_qk.input[1]) - - if input_0_shape is None or input_1_shape is None: - logger.debug(f"one of the inputs of {add_qk} is None") - return None - - if input_0_shape != input_1_shape: - logger.debug(f"the shape of two inputs of {add_qk} is not same") - return None - - return add_qk.input[1] - - def create_attention_node( - self, - mask_index: str, - q_matmul: NodeProto, - k_matmul: NodeProto, - v_matmul: NodeProto, - q_add: NodeProto, - k_add: NodeProto, - v_add: NodeProto, - num_heads: int, - hidden_size: int, - input: str, - output: str, - add_qk_str: str, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - mask_index (str): mask input - q_matmul (NodeProto): MatMul node in fully connection for Q - k_matmul (NodeProto): MatMul node in fully connection for K - v_matmul (NodeProto): MatMul node in fully connection for V - q_add (NodeProto): Add bias node in fully connection for Q - k_add (NodeProto): Add bias node in fully connection for K - v_add (NodeProto): Add bias node in fully connection for V - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - q_weight = self.model.get_initializer(q_matmul.input[1]) - k_weight = self.model.get_initializer(k_matmul.input[1]) - v_weight = self.model.get_initializer(v_matmul.input[1]) - q_bias = self.model.get_initializer( - q_add.input[1] - ) or self.model.get_initializer(q_add.input[0]) - k_bias = self.model.get_initializer( - k_add.input[1] - ) or self.model.get_initializer(k_add.input[0]) - v_bias = self.model.get_initializer( - v_add.input[1] - ) or self.model.get_initializer(v_add.input[0]) - - if q_weight is None: - print( - f"{q_matmul.input[1]} is not an initializer. " - "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion" - ) - return None - if not (k_weight and v_weight and q_bias and k_bias): - return None - - qw = NumpyHelper.to_array(q_weight) - kw = NumpyHelper.to_array(k_weight) - vw = NumpyHelper.to_array(v_weight) - - # assert q and k have same shape as expected - assert qw.shape == kw.shape - - qw_in_size = qw.shape[0] - kw_in_size = kw.shape[0] - vw_in_size = vw.shape[0] - - assert qw_in_size == kw_in_size == vw_in_size - - if hidden_size > 0 and hidden_size != qw_in_size: - logger.warning( - f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). " - "Please provide a correct input hidden size or pass in 0" - ) - - is_qkv_diff_dims = False - - # All the matrices can have the same shape or q, k matrics can have the same shape with v being different - # For 2d weights, the shapes would be [in_size, out_size]. - # For 3d weights, shape would be [in_size, a, b] where a*b = out_size - qw_out_size = np.prod(qw.shape[1:]) - kw_out_size = np.prod(kw.shape[1:]) - vw_out_size = np.prod(vw.shape[1:]) - - qkv_weight_dim = 0 - qkv_weight = np.concatenate((qw, kw, vw), axis=1) - qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size - - qb = NumpyHelper.to_array(q_bias) - kb = NumpyHelper.to_array(k_bias) - vb = NumpyHelper.to_array(v_bias) - - q_bias_shape = np.prod(qb.shape) - k_bias_shape = np.prod(kb.shape) - v_bias_shape = np.prod(vb.shape) - - assert q_bias_shape == k_bias_shape == qw_out_size - assert v_bias_shape == vw_out_size - - qkv_bias_dim = 0 - if is_qkv_diff_dims: - qkv_bias = np.concatenate((qb, kb, vb), axis=0) - qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape - else: - qkv_bias = np.stack((qb, kb, vb), axis=0) - qkv_bias_dim = 3 * q_bias_shape - - attention_node_name = self.model.create_node_name("Attention") - - weight = helper.make_tensor( - name=attention_node_name + "_qkv_weight", - data_type=TensorProto.FLOAT, - dims=[qkv_weight_dim, qw_in_size], - vals=qkv_weight.transpose(1, 0).flatten().tolist(), - ) - - # Sometimes weights and bias are stored in fp16 - if q_weight.data_type == 10: - weight.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(weight).astype(np.float16), weight.name - ) - ) - self.model.add_initializer(weight, self.this_graph_name) - - bias = helper.make_tensor( - name=attention_node_name + "_qkv_bias", - data_type=TensorProto.FLOAT, - dims=[qkv_bias_dim], - vals=qkv_bias.flatten().tolist(), - ) - if q_bias.data_type == 10: - bias.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(bias).astype(np.float16), bias.name - ) - ) - self.model.add_initializer(bias, self.this_graph_name) - - fc_output_tensor = helper.make_tensor_value_info( - 
attention_node_name + "_input", TensorProto.FLOAT, [None, None, None] - ) - fc_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[input], - outputs=[fc_output_tensor.name], - name=self.model.create_node_name("AttentionFC", "MatMul_AddBias_"), - ) - fc_node.domain = "com.iluvatar" - b = NumpyHelper.to_array(bias) - fc_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) - fc_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fc_node.attribute.extend([helper.make_attribute("W", weight)]) - fc_node.attribute.extend([helper.make_attribute("B", bias)]) - fc_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fc_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fc_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fc_node.name] = self.this_graph_name - self.nodes_to_add.append(fc_node) - - attention_inputs = [fc_node.output[0]] - if mask_index is not None: - attention_inputs.append(mask_index) - else: - attention_inputs.append("") - - if add_qk_str is not None: - attention_inputs.append("") - attention_inputs.append(add_qk_str) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) - - if is_qkv_diff_dims: - attention_node.attribute.extend( - [ - helper.make_attribute( - "qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size] - ) - ] - ) - - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - else: - return - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
- qkv_nodes = self.model.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [None, None, 0, 0, 0], - ) - if qkv_nodes is None: - qkv_nodes = self.model.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [1, None, 0, 0, 0], - ) - einsum_node = None - if qkv_nodes is not None: - (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes - else: - # Match Albert - qkv_nodes = self.model.match_parent_path( - start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0] - ) - if qkv_nodes is not None: - (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes - else: - return - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match flaubert Mask - | - Mul --> LayerNormalization --> Attention --> MatMul --> Add - | | - | | - +--------------------------------------------------------- - """ - mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0) - if mul_before_layernorm is not None: - mul_children = input_name_to_nodes[mul_before_layernorm.output[0]] - if mul_children is not None and len(mul_children) == 2: - layernorm_node = mul_children[1] - if layernorm_node.op_type == "LayerNormalization": - root_input = layernorm_node.output[0] - else: - return - elif mul_children is not None and len(mul_children) == 5: - root_input = mul_before_layernorm.output[0] - else: - return - elif normalize_node.op_type == "LayerNormalization": - children = input_name_to_nodes[root_input] - for child in children: - if child.op_type == "LayerNormalization": - root_input = child.output[0] - - children = input_name_to_nodes[root_input] - children_types = [child.op_type for child in children] - if children_types.count("MatMul") != 3: - return - - v_nodes = self.model.match_parent_path( - matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] - ) - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - (_, _, add_v, matmul_v) = v_nodes - - is_distill = False - is_distill_add = False - is_mul_split = False - qk_paths = { - "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]), - "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]), - "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]), - "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]), - "path5": (["Softmax", "Add", "MatMul"], [0, 0, None]) - } - - qk_nodes = None - for k, v in qk_paths.items(): - qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1]) - if qk_nodes is None: - continue - if k == "path3": - is_distill = True - if k == "path4": - is_distill_add = True - if k == "path5": - is_mul_split = True - break - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - add_qk = None - matmul_qk = None - where_qk = None - if is_distill: - (_, where_qk, matmul_qk, _) = qk_nodes - elif is_distill_add: - (_, add_qk, where_qk, matmul_qk) = qk_nodes - elif is_mul_split: - (_, add_qk, matmul_qk) = qk_nodes - else: - (_, add_qk, _, matmul_qk) = qk_nodes - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None] - ) - if q_nodes is None: - q_nodes = self.model.match_parent_path( - matmul_qk, - ["Div", "Transpose", "Reshape", "Add", "MatMul"], - [0, 0, 0, 0, None], - 
) - if q_nodes is None and is_mul_split: - q_nodes = self.model.match_parent_path( - matmul_qk, - ["Mul", "Transpose", "Reshape", "Add", "MatMul"], - [0, 0, 0, 0, None], - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - reshape_q = q_nodes[-3] - add_q = q_nodes[-2] - matmul_q = q_nodes[-1] - - k_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] - ) - if k_nodes is None: - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Transpose", "Transpose", "Reshape", "Add", "MatMul"], - [1, 0, 0, 0, None], - ) - if k_nodes is None and is_mul_split: - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Mul", "Transpose", "Reshape", "Add", "MatMul"], - [1, 0, 0, 0, None], - ) - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - add_k = k_nodes[-2] - matmul_k = k_nodes[-1] - - # Note that Cast might be removed by OnnxRuntime so we match two patterns here. - mask_nodes = None - add_qk_str = None - if is_distill: - _, mask_nodes, _ = self.model.match_parent_paths( - where_qk, - [ - (["Expand", "Reshape", "Equal"], [0, 0, 0]), - (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), - (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]), - ], - output_name_to_node, - ) - elif is_distill_add: - _, mask_nodes, _ = self.model.match_parent_paths( - where_qk, - [ - (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]), - (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), - ], - output_name_to_node, - ) - if add_qk is not None: - add_qk_str = self.get_add_qk_str(add_qk) - if add_qk_str is None: - logger.debug( - f"fuse_attention: failed to verify shape inference of {add_qk}" - ) - return - elif is_mul_split: - _, mask_nodes, _ = self.model.match_parent_paths( - add_qk, - [ - (["Where", "Cast", "Sub", "Cast", "Expand", "Unsqueeze"], [None, 0, 0, 1, 0, 0]) - ], - output_name_to_node, - ) - else: - _, mask_nodes, _ = self.model.match_parent_paths( - add_qk, - [ - ( - ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], - [None, 0, 1, 0, 0], - ), - (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]), - (["Mul", "Sub", "Cast", "Unsqueeze"], [None, 0, 1, 0]), - ], - output_name_to_node, - ) - if mask_nodes is None: - logger.debug("fuse_attention: failed to match mask path") - return - - if ( - matmul_v.input[0] == root_input - and matmul_q.input[0] == root_input - and matmul_k.input[0] == root_input - ): - # mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) - if mask_nodes[0].op_type == "Mul": - mask_val = self.model.get_initializer(mask_nodes[0].input[1]) - if mask_val is not None: - mask_val_arr = NumpyHelper.to_array(mask_val) - mask_val_arr = np.where(mask_val_arr <= -100, -100, 0.0).astype( - np.float32 - ) - mask_val.CopyFrom( - numpy_helper.from_array(mask_val_arr, mask_val.name) - ) - mask_index = mask_nodes[0].output[0] - - attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv - - q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q) - # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads - # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately - new_node = self.create_attention_node( - mask_index, - matmul_q, - matmul_k, - matmul_v, - add_q, - add_k, - add_v, - q_num_heads, - q_hidden_size, - root_input, - attention_last_node.output[0], - add_qk_str, - ) - if 
new_node is None: - return - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - if einsum_node is not None: - unique_index = einsum_node.input[0] - new_edge = "edge_modified_" + unique_index - shape_tensor = helper.make_tensor( - name="shape_modified_tensor" + unique_index, - data_type=TensorProto.INT64, - dims=[4], - vals=np.int64( - [0, 0, q_num_heads, int(q_hidden_size / q_num_heads)] - ).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor, self.this_graph_name) - self.model.add_node( - helper.make_node( - "Reshape", - [attention_last_node.output[0], shape_tensor.name], - [new_edge], - "reshape_modified_" + unique_index, - ), - self.this_graph_name, - ) - einsum_node.input[0] = new_edge - - self.nodes_to_remove.extend( - [attention_last_node, transpose_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes) - - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py deleted file mode 100644 index 38ddf62986b46b350cdf158eeccfcf1e3602fe0c..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py +++ /dev/null @@ -1,634 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class AttentionMask: - """ - Fuse Attention subgraph into one Attention node. 
- """ - - def __init__(self, model: OnnxModel): - self.model = model - # A lookup table with mask input as key, and mask index output as value - self.mask_indice = {} - # A lookup table with mask input as key, and cast (to int32) output as value - self.mask_casted = {} - self.utils = FusionUtils(model) - self.mask_format = AttentionMaskFormat.MaskIndexEnd - - def set_mask_format(self, mask_format: AttentionMaskFormat): - self.mask_format = mask_format - - def set_mask_indice(self, mask, mask_index): - if mask in self.mask_indice: - assert mask_index == self.mask_indice[mask] - self.mask_indice[mask] = mask_index - - def get_first_mask(self): - assert len(self.mask_indice) > 0 - return next(iter(self.mask_indice)) - - def process_mask(self, input: str) -> str: - if self.mask_format == AttentionMaskFormat.NoMask: - return None - - if input in self.mask_indice: - return self.mask_indice[input] - - # Add cast to convert int64 to int32 - if self.model.find_graph_input(input): - casted, input_name = self.utils.cast_graph_input_to_int32(input) - else: - input_name, cast_node = self.utils.cast_input_to_int32(input) - casted = True - - if casted: - self.mask_casted[input] = input_name - - # Attention supports int32 attention mask (2D) since 1.4.0 - if self.mask_format == AttentionMaskFormat.AttentionMask: - self.mask_indice[input] = input_name - return input_name - - # Add a mask processing node to convert attention mask to mask index (1D) - output_name = self.model.create_node_name("mask_index") - mask_index_node = helper.make_node( - "ReduceSum", - inputs=[input_name], - outputs=[output_name], - name=self.model.create_node_name("ReduceSum", "MaskReduceSum"), - ) - mask_index_node.attribute.extend( - [helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)] - ) - self.model.add_node(mask_index_node) - - self.mask_indice[input] = output_name - return output_name - - -class FusionAttention(Fusion): - """ - Fuse Attention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - hidden_size: int, - num_heads: int, - attention_mask: AttentionMask, - ): - super().__init__( - model, "Attention", ["SkipLayerNormalization", "LayerNormalization"] - ) - self.hidden_size = hidden_size - self.num_heads = num_heads - self.attention_mask = attention_mask - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. - - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return self.num_heads, self.hidden_size # Fall back to user specified value - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." 
- ) - return self.num_heads, self.hidden_size # Fall back to user specified value - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - if self.num_heads > 0 and num_heads != self.num_heads: - if self.num_heads_warning: - logger.warning( - f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value." - ) - self.num_heads_warning = False # Do not show the warning more than once - - if self.hidden_size > 0 and hidden_size != self.hidden_size: - if self.hidden_size_warning: - logger.warning( - f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." - ) - self.hidden_size_warning = ( - False # Do not show the warning more than once - ) - - return num_heads, hidden_size - - def get_add_qk_str(self, add_qk: NodeProto): - shape_infer = self.model.infer_runtime_shape(update=True) - if shape_infer is None: - return - - input_0_shape = shape_infer.get_edge_shape(add_qk.input[0]) - input_1_shape = shape_infer.get_edge_shape(add_qk.input[1]) - - if input_0_shape is None or input_1_shape is None: - logger.debug(f"one of the inputs of {add_qk} is None") - return None - - if input_0_shape != input_1_shape: - logger.debug(f"the shape of two inputs of {add_qk} is not same") - return None - - return add_qk.input[1] - - def create_attention_node( - self, - mask_index: str, - q_matmul: NodeProto, - k_matmul: NodeProto, - v_matmul: NodeProto, - q_add: NodeProto, - k_add: NodeProto, - v_add: NodeProto, - num_heads: int, - hidden_size: int, - input: str, - output: str, - add_qk_str: str, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - mask_index (str): mask input - q_matmul (NodeProto): MatMul node in fully connection for Q - k_matmul (NodeProto): MatMul node in fully connection for K - v_matmul (NodeProto): MatMul node in fully connection for V - q_add (NodeProto): Add bias node in fully connection for Q - k_add (NodeProto): Add bias node in fully connection for K - v_add (NodeProto): Add bias node in fully connection for V - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - q_weight = self.model.get_initializer(q_matmul.input[1]) - k_weight = self.model.get_initializer(k_matmul.input[1]) - v_weight = self.model.get_initializer(v_matmul.input[1]) - q_bias = self.model.get_initializer( - q_add.input[1] - ) or self.model.get_initializer(q_add.input[0]) - k_bias = self.model.get_initializer( - k_add.input[1] - ) or self.model.get_initializer(k_add.input[0]) - v_bias = self.model.get_initializer( - v_add.input[1] - ) or self.model.get_initializer(v_add.input[0]) - - if q_weight is None: - print( - f"{q_matmul.input[1]} is not an initializer. 
" - "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion" - ) - return None - if not (k_weight and v_weight and q_bias and k_bias): - return None - - qw = NumpyHelper.to_array(q_weight) - kw = NumpyHelper.to_array(k_weight) - vw = NumpyHelper.to_array(v_weight) - - # assert q and k have same shape as expected - assert qw.shape == kw.shape - - qw_in_size = qw.shape[0] - kw_in_size = kw.shape[0] - vw_in_size = vw.shape[0] - - assert qw_in_size == kw_in_size == vw_in_size - - if hidden_size > 0 and hidden_size != qw_in_size: - logger.warning( - f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). " - "Please provide a correct input hidden size or pass in 0" - ) - - is_qkv_diff_dims = False - if qw.shape != vw.shape: - is_qkv_diff_dims = True - - # All the matrices can have the same shape or q, k matrics can have the same shape with v being different - # For 2d weights, the shapes would be [in_size, out_size]. - # For 3d weights, shape would be [in_size, a, b] where a*b = out_size - qw_out_size = np.prod(qw.shape[1:]) - kw_out_size = np.prod(kw.shape[1:]) - vw_out_size = np.prod(vw.shape[1:]) - - qkv_weight_dim = 0 - if is_qkv_diff_dims: - qkv_weight = np.concatenate((qw, kw, vw), axis=1) - qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size - else: - qkv_weight = np.stack((qw, kw, vw), axis=1) - qkv_weight_dim = 3 * qw_out_size - - qb = NumpyHelper.to_array(q_bias) - kb = NumpyHelper.to_array(k_bias) - vb = NumpyHelper.to_array(v_bias) - - q_bias_shape = np.prod(qb.shape) - k_bias_shape = np.prod(kb.shape) - v_bias_shape = np.prod(vb.shape) - - assert q_bias_shape == k_bias_shape == qw_out_size - assert v_bias_shape == vw_out_size - - qkv_bias_dim = 0 - if is_qkv_diff_dims: - qkv_bias = np.concatenate((qb, kb, vb), axis=0) - qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape - else: - qkv_bias = np.stack((qb, kb, vb), axis=0) - qkv_bias_dim = 3 * q_bias_shape - - attention_node_name = self.model.create_node_name("Attention") - - weight = helper.make_tensor( - name=attention_node_name + "_qkv_weight", - data_type=TensorProto.FLOAT, - dims=[qw_in_size, qkv_weight_dim], - vals=qkv_weight.flatten().tolist(), - ) - - # Sometimes weights and bias are stored in fp16 - if q_weight.data_type == 10: - weight.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(weight).astype(np.float16), weight.name - ) - ) - self.model.add_initializer(weight, self.this_graph_name) - - bias = helper.make_tensor( - name=attention_node_name + "_qkv_bias", - data_type=TensorProto.FLOAT, - dims=[qkv_bias_dim], - vals=qkv_bias.flatten().tolist(), - ) - if q_bias.data_type == 10: - bias.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(bias).astype(np.float16), bias.name - ) - ) - self.model.add_initializer(bias, self.this_graph_name) - - attention_inputs = [ - input, - attention_node_name + "_qkv_weight", - attention_node_name + "_qkv_bias", - ] - if mask_index is not None: - attention_inputs.append(mask_index) - else: - attention_inputs.append("") - - if add_qk_str is not None: - attention_inputs.append("") - attention_inputs.append(add_qk_str) - - attention_node = helper.make_node( - "Attention", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - - if is_qkv_diff_dims: - attention_node.attribute.extend( - [ - helper.make_attribute( - "qkv_hidden_sizes", 
[qw_out_size, kw_out_size, vw_out_size] - ) - ] - ) - - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - else: - return - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. - qkv_nodes = self.model.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [None, None, 0, 0, 0], - ) - einsum_node = None - if qkv_nodes is not None: - (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes - else: - # Match Albert - qkv_nodes = self.model.match_parent_path( - start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0] - ) - if qkv_nodes is not None: - (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes - else: - return - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match flaubert Mask - | - Mul --> LayerNormalization --> Attention --> MatMul --> Add - | | - | | - +--------------------------------------------------------- - """ - mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0) - if mul_before_layernorm is not None: - mul_children = input_name_to_nodes[mul_before_layernorm.output[0]] - if mul_children is not None and len(mul_children) == 2: - layernorm_node = mul_children[1] - if layernorm_node.op_type == "LayerNormalization": - root_input = layernorm_node.output[0] - else: - return - elif mul_children is not None and len(mul_children) == 5: - root_input = mul_before_layernorm.output[0] - else: - return - elif normalize_node.op_type == "LayerNormalization": - children = input_name_to_nodes[root_input] - for child in children: - if child.op_type == "LayerNormalization": - root_input = child.output[0] - - children = input_name_to_nodes[root_input] - children_types = [child.op_type for child in children] - if children_types.count("MatMul") != 3: - return - - v_nodes = self.model.match_parent_path( - matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] - ) - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - (_, _, add_v, matmul_v) = v_nodes - - is_distill = False - is_distill_add = False - qk_paths = { - "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]), - "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]), - "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]), - "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]), - } - - qk_nodes = None - for k, v in qk_paths.items(): - qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1]) - if qk_nodes is None: - continue - if k == "path3": - is_distill = True - if k == "path4": - is_distill_add = True - break - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - - add_qk = None - matmul_qk = None - where_qk = None - if is_distill: - (_, where_qk, 
matmul_qk, _) = qk_nodes - elif is_distill_add: - (_, add_qk, where_qk, matmul_qk) = qk_nodes - else: - (_, add_qk, _, matmul_qk) = qk_nodes - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None] - ) - if q_nodes is None: - q_nodes = self.model.match_parent_path( - matmul_qk, - ["Div", "Transpose", "Reshape", "Add", "MatMul"], - [0, 0, 0, 0, None], - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - reshape_q = q_nodes[-3] - add_q = q_nodes[-2] - matmul_q = q_nodes[-1] - - k_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] - ) - if k_nodes is None: - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Transpose", "Transpose", "Reshape", "Add", "MatMul"], - [1, 0, 0, 0, None], - ) - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - add_k = k_nodes[-2] - matmul_k = k_nodes[-1] - - # Note that Cast might be removed by OnnxRuntime so we match two patterns here. - mask_nodes = None - add_qk_str = None - if is_distill: - _, mask_nodes, _ = self.model.match_parent_paths( - where_qk, - [ - (["Expand", "Reshape", "Equal"], [0, 0, 0]), - (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), - (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]), - ], - output_name_to_node, - ) - elif is_distill_add: - _, mask_nodes, _ = self.model.match_parent_paths( - where_qk, - [ - (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]), - (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), - ], - output_name_to_node, - ) - if add_qk is not None: - add_qk_str = self.get_add_qk_str(add_qk) - if add_qk_str is None: - logger.debug( - f"fuse_attention: failed to verify shape inference of {add_qk}" - ) - return - else: - _, mask_nodes, _ = self.model.match_parent_paths( - add_qk, - [ - ( - ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], - [None, 0, 1, 0, 0], - ), - (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]), - ], - output_name_to_node, - ) - if mask_nodes is None: - logger.debug("fuse_attention: failed to match mask path") - return - - if ( - matmul_v.input[0] == root_input - and matmul_q.input[0] == root_input - and matmul_k.input[0] == root_input - ): - mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) - - attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv - - q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q) - # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads - # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately - new_node = self.create_attention_node( - mask_index, - matmul_q, - matmul_k, - matmul_v, - add_q, - add_k, - add_v, - q_num_heads, - q_hidden_size, - root_input, - attention_last_node.output[0], - add_qk_str, - ) - if new_node is None: - return - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - if einsum_node is not None: - unique_index = einsum_node.input[0] - new_edge = "edge_modified_" + unique_index - shape_tensor = helper.make_tensor( - name="shape_modified_tensor" + unique_index, - data_type=TensorProto.INT64, - dims=[4], - vals=np.int64( - [0, 0, q_num_heads, int(q_hidden_size / q_num_heads)] - ).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor, self.this_graph_name) - self.model.add_node( - 
helper.make_node( - "Reshape", - [attention_last_node.output[0], shape_tensor.name], - [new_edge], - "reshape_modified_" + unique_index, - ), - self.this_graph_name, - ) - einsum_node.input[0] = new_edge - - self.nodes_to_remove.extend( - [attention_last_node, transpose_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes) - - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py deleted file mode 100644 index 3732b0f5fab40cbb269f18abdd56286f298a5493..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import List, Union - -from onnx import GraphProto - -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class Fusion: - def __init__( - self, - model: OnnxModel, - fused_op_type: str, - search_op_types: Union[str, List[str]], - description: str = None, - ): - self.search_op_types: List[str] = ( - [search_op_types] if isinstance(search_op_types, str) else search_op_types - ) - self.fused_op_type: str = fused_op_type - self.description: str = ( - f"{fused_op_type}({description})" if description else fused_op_type - ) - self.model: OnnxModel = model - self.nodes_to_remove: List = [] - self.nodes_to_add: List = [] - self.prune_graph: bool = False - self.node_name_to_graph_name: dict = {} - self.this_graph_name: str = None - # It is optional that subclass updates fused_count since we will also check nodes_to_add to get counter. - self.fused_count: int = 0 - - def apply(self): - logger.debug(f"start {self.description} fusion...") - input_name_to_nodes = self.model.input_name_to_nodes() - output_name_to_node = self.model.output_name_to_node() - - # This assumes that two search ops will not be fused at same time! 
- for search_op_type in self.search_op_types: - for node in self.model.get_nodes_by_op_type(search_op_type): - graph = self.model.get_graph_by_node(node) - if graph is None: - raise Exception("Can not find node in any graphs") - self.this_graph_name = graph.name - self.fuse(node, input_name_to_nodes, output_name_to_node) - - op_list = [node.op_type for node in self.nodes_to_add] - count = max(self.fused_count, op_list.count(self.fused_op_type)) - if count > 0: - logger.info(f"Fused {self.description} count: {count}") - - self.model.remove_nodes(self.nodes_to_remove) - self.model.add_nodes(self.nodes_to_add, self.node_name_to_graph_name) - - if self.prune_graph: - self.model.prune_graph() - elif self.nodes_to_remove or self.nodes_to_add: - self.model.update_graph() - - def match_parent_path_from_dict( - self, start_node, path_dict, output_name_to_node=None, return_indice=None - ): - res_path = None - res_nodes = None - for k, v in path_dict.items(): - res_nodes = self.model.match_parent_path( - start_node, - v[0], - v[1], - output_name_to_node=output_name_to_node, - return_indice=return_indice, - ) - if res_nodes is None: - continue - return res_nodes, k - return res_nodes, res_path diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py deleted file mode 100644 index 045cd99380a7535079d0f9f33322e2879d2074c0..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionBiasGelu(Fusion): - def __init__(self, model: OnnxModel, is_fastgelu): - if is_fastgelu: - super().__init__(model, "FastGelu", "FastGelu", "add bias") - else: - super().__init__(model, "BiasGelu", "Gelu") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - gelu_op_type = node.op_type - fuse_op_type = "BiasGelu" if gelu_op_type == "Gelu" else "FastGelu" - - if len(node.input) != 1: - return - - nodes = self.model.match_parent_path(node, ["Add", "MatMul"], [0, None]) - if nodes is None: - return - (add, matmul) = nodes - - bias_weight = None - # bias should be one dimension - bias_index = -1 - for i, input in enumerate(add.input): - initializer = self.model.get_initializer(input) - if initializer is None: - continue - bias_index = i - bias_weight = NumpyHelper.to_array(initializer) - break - if bias_weight is None: - return - if len(bias_weight.shape) != 1: - return - - subgraph_nodes = [node, add] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - - fused_node = helper.make_node( - fuse_op_type, - inputs=[matmul.output[0], add.input[bias_index]], - outputs=node.output, - name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"), - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py deleted file mode 100644 index 21161727373b1ceee5362bc2fa0e713f17e899ae..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class FusionConformerAttention(Fusion):
-    """
-    Fuse conformer attention subgraph into one Attention node.
-    """
-
-    def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
-        super().__init__(model, "CustomQKVToContextPluginDynamic_IxRT", ["Concat"])
-
-        # Flags to show warning only once
-        self.num_heads_warning = True
-        self.hidden_size_warning = True
-
-        self.hidden_size = hidden_size
-        self.num_heads = num_heads
-
-    def get_num_heads_and_hidden_size(
-        self, atten_matmul: NodeProto, div: NodeProto
-    ) -> Tuple[int, int]:
-        """Detect num_heads and hidden_size from the attention MatMul weight and the Div node.
-
-        Args:
-            atten_matmul (NodeProto): MatMul node whose weight carries the hidden size
-            div (NodeProto): Div node whose constant is sqrt(head_dim)
-
-        Returns:
-            Tuple[int, int]: num_heads and hidden_size
-        """
-
-        # we assume that reshape fusion has been done, so the shape is a tensor like [0, 0, num_heads, head_size]
-        atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1])
-        div_initializer = self.model.get_initializer(div.input[1])
-
-        # Check whether float_data is populated
-        if len(div_initializer.float_data) > 0:
-            div_value = div_initializer.float_data[0]
-        else:
-            # float_data is empty, so try to read the value another way,
-            # e.g. it may be stored in raw_data
-            if len(div_initializer.raw_data) > 0:
-                dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type]
-                div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0]
-            else:
-                raise ValueError("Data not found in the div_initializer")
-
-        atten_matul_shape_value = NumpyHelper.to_array(atten_matul_initializer).shape
-        head_dim = math.ceil(div_value * div_value)
-        hidden_size = atten_matul_shape_value[0]
-        num_heads = hidden_size // head_dim
-
-        return num_heads, hidden_size
-
-    def create_attention_node(
-        self, num_heads: int, hidden_size: int, inputs: str, outputs: str
-    ) -> Union[NodeProto, None]:
-        """Create an Attention node.
-
-        Args:
-            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
-            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
-            inputs (str): input names
-            outputs (str): output names
-
-        Returns:
-            Union[NodeProto, None]: the node created or None if failed.
- """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=inputs, - outputs=outputs, - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) - - return attention_node - - def fuse_reshape(self, shape_data_name): - - shape_tensor = helper.make_tensor( - name=shape_data_name, - data_type=TensorProto.INT64, - dims=[3], - vals=np.int64([128, -1, self.hidden_size // self.num_heads]).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor, self.this_graph_name) - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - - paths = { - "path": ( - ["Unsqueeze", "Mul", "Gather", "Shape", "LayerNormalization"], - [None, None, None, None, None], - ), - } - - reshape_nodes, reshape_path = self.match_parent_path_from_dict( - start_node, paths - ) - if reshape_nodes is None: - return - - self.nodes_to_remove.append(start_node) - - self.nodes_to_remove.extend(reshape_nodes[:-1]) - self.fuse_reshape(start_node.output[0]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py deleted file mode 100644 index b55c2412b07067d3ebb05cc080be6a3a31902e22..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -import numpy as np -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionConformerXSoftmax(Fusion): - """ - Fuse Where + Softmax + Where into one node: XSoftmax - """ - - def __init__(self, model: OnnxModel): - super().__init__(model, "XSoftmax_IxRT", "Softmax") - - def create_xsoftmax_node( - self, data_input: str, mask_input: str, output: str - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - - unique_index = data_input - new_edge = "edge_modified_" + unique_index - shape_tensor = helper.make_tensor( - name="shape_modified_tensor_" + unique_index, - data_type=TensorProto.INT64, - dims=[4], - vals=np.int64( - [-1, 8, 128, 128] # (BSZ, HEAD_NUM, SEQ_LEN, SEQ_LEN) - ).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor, self.this_graph_name) - self.model.add_node( - helper.make_node( - "Reshape", - [data_input, shape_tensor.name], - [new_edge], - "reshape_modified_" + unique_index, - ), - self.this_graph_name, - ) - - new_edge2 = "edge_modified2_" + unique_index - xsoftmax_node_name = self.model.create_node_name("XSoftmax") - - xsoftmax_node = helper.make_node( - "XSoftmax_IxRT", - inputs=[new_edge, mask_input], - outputs=[new_edge2], - name=xsoftmax_node_name, - ) - xsoftmax_node.domain = "com.iluvatar" - xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)]) - xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)]) - xsoftmax_node.attribute.extend([helper.make_attribute("is_conformer", 1)]) - - shape_tensor2 = helper.make_tensor( - name="shape_modified_tensor2_" + unique_index, - data_type=TensorProto.INT64, - dims=[3], - vals=np.int64( - [-1, 128, 128] # (BSZ, HEAD_NUM, SEQ_LEN, SEQ_LEN) - ).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor2, self.this_graph_name) - self.model.add_node( - helper.make_node( - "Reshape", - [new_edge2, shape_tensor2.name], - [output], - "reshape_modified2_" + unique_index, - ), - self.this_graph_name, - ) - - return xsoftmax_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - xsoftmax_paths = { - "path": (["Add", "Where", "Reshape", "Expand"], [None, None, None, None]), - } - xsoftmax_nodes, xsoftmax_path = self.match_parent_path_from_dict( - node, xsoftmax_paths - ) - - if xsoftmax_nodes is None: - logger.debug("fuse_xsoftmax: failed to match xsoftmax path") - return - else: - (add_node, where_node, reshape_node, expand_node) = xsoftmax_nodes - - mask_input = expand_node.input[0] - - data_output = node.output[0] - - data_input = add_node.input[0] - if where_node.output[0] == add_node.input[0]: - data_input = add_node.input[1] - xsoftmax_node = self.create_xsoftmax_node( - data_input, mask_input, data_output - ) - - self.nodes_to_remove.extend(xsoftmax_nodes) - self.nodes_to_add.append(xsoftmax_node) - self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name diff --git 
a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conv_reformat.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conv_reformat.py deleted file mode 100644 index 23cdd0c2d0dca61bf66eb1f484e3093f4d7bf0c6..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conv_reformat.py +++ /dev/null @@ -1,128 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionConvReformat(Fusion): - """ - Fuse FusionPVTAttention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "FuseConvReformat_IxRT", - ["Transpose"], - ) - - - - def create_fuse_node( - self, inputs: str, outputs: str, before_conv: int, shape_data: list, prefix - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - - node_name = self.model.create_node_name(f"FuseConvReformat_{prefix}") - node = helper.make_node( - "FuseConvReformat_IxRT", - inputs=inputs, - outputs=outputs, - name=node_name, - ) - node.domain = "com.iluvatar" - - node.attribute.extend([helper.make_attribute("before_conv", before_conv)]) - node.attribute.extend([helper.make_attribute("shape_data", shape_data)]) - node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - return node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - """ - eliminate Transpose(linear->nchw) + Transpose - path: - ----->Transpose ---->Reshape---> conv ----->Reshape ---->Transpose---> - - to: - ----->FuseConvReformat_IxRT---> conv ----->FuseConvReformat_IxRT---> - - """ - start_node = node - paths = { - "path": (["Reshape", "Conv", "Reshape","Transpose"], [0, 0, 0, 0]), # cross attention qery pass - } - - nodes, path = self.match_parent_path_from_dict(start_node, paths) - - if nodes is None: - logger.debug("FuseConvReformat: failed to match path") - return - - (reshape_after_node, conv_node, reshape_before_node, tranpose_before_node) = nodes - - perm1 = tranpose_before_node.attribute[0].ints - if perm1 !=[0, 2, 1]: - return - perm2 = start_node.attribute[0].ints - if perm2 !=[0, 2, 1]: - return - - before_shape_data = numpy_helper.to_array(self.model.get_initializer(reshape_before_node.input[1])) - - if before_shape_data.shape[0] != 4: - return - - after_shape_data = numpy_helper.to_array(self.model.get_initializer(reshape_after_node.input[1])) - if after_shape_data.shape[0] != 3: - return - node1_inputs = tranpose_before_node.input - node1_outputs = reshape_before_node.output - node1_before_conv = 1 - - new_node1 = self.create_fuse_node( - node1_inputs, node1_outputs, node1_before_conv, before_shape_data,"before") - - - node2_inputs = conv_node.output - node2_outputs = start_node.output - node2_before_conv = 0 - new_node2 = self.create_fuse_node( - node2_inputs, node2_outputs, node2_before_conv, after_shape_data,"after") - - self.nodes_to_add.append(new_node1) - self.nodes_to_add.append(new_node2) - self.node_name_to_graph_name[new_node1.name] = self.this_graph_name - self.node_name_to_graph_name[new_node2.name] = self.this_graph_name - self.nodes_to_remove.extend([start_node, reshape_after_node,reshape_before_node,tranpose_before_node]) - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_attention.py deleted file mode 100644 index 5bfa8768e7077fad40b9ef8ff51427db217a5069..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_attention.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - - -class FusionCosyvoiceAttention(Fusion): - """ - Fuse T5Attention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQkvCrossToContext_IxRT", - ["Softmax"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. - - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return [0, 0] - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return [0, 0] - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - return num_heads, hidden_size - - def create_decoder_attention_node( - self, inputs: str, outputs: str, type_mask: int, has_mask: int, scale: float - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - - attention_node_name = self.model.create_node_name("decoder_Attention") - attention_node = helper.make_node( - "CustomQkvCrossToContext_IxRT", - inputs=inputs, - outputs=outputs, - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("scale", scale)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("type_mask", type_mask)]) - - return attention_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - """ - path1: - - (query) --------------MatMul---Div --> add -->softmax --->MatMul---> - / / / - (key) ---->Transpose > / / - / / - (mask) ------------------------> / - / - (value)---------------------------------------------> - """ - - - - - import pdb - start_node = node - qkv_paths = { - "path1": ( - ["Add", "Div", "MatMul", "Transpose"], - [None, 0, None, 1], - ), # float mask self attention,self attention key pass - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - next_nodes = self.model.get_children(node) - - if len(next_nodes) == 0: - return - - if next_nodes[0].op_type != "MatMul": - return - - second_matmul_node = next_nodes[0] - attention_inputs = None - attention_outputs = second_matmul_node.output - remove_nodes = [second_matmul_node, node] - - (add_node, div_node, first_matmul_node, transpose_node) = qkv_nodes - transpose_nodes = self.model.get_parents(first_matmul_node) - q_input = transpose_nodes[0].output[0] - - k_transpose_node = transpose_nodes[1] - k_transpose_node_perm = k_transpose_node.attribute[0].ints - - if k_transpose_node_perm == [0, 2, 3, 1]: #transpose has bean merge,[0,2,1,3]->[0, 1, 3, 2] = [0, 2, 3, 1] - k_input = transpose_nodes[1].output[0] - - transpose_nodes[1].attribute[0].ints[0] = 0 - transpose_nodes[1].attribute[0].ints[1] = 2 - transpose_nodes[1].attribute[0].ints[2] = 1 - transpose_nodes[1].attribute[0].ints[3] = 3 - - remove_nodes.extend([add_node, div_node, first_matmul_node]) - - elif k_transpose_node_perm == [0, 1, 3, 2]: - k_input = transpose_nodes[1].input[0] - remove_nodes.extend([add_node, div_node, first_matmul_node,k_transpose_node]) - - else: - return - - v_input = second_matmul_node.input[1] - attention_inputs = [q_input, k_input, v_input] - - has_mask = 1 - type_mask = 3 # float mask - - mask_input = add_node.input[0] - score_out = div_node.output[0] - if add_node.input[0] == score_out: - mask_input = add_node.input[1] - attention_inputs.append(mask_input) - - scale_data = self.model.get_initializer_input_edges(div_node.name, return_np_array = True) - scale = 1.0 / scale_data[0] - - atten_node = self.create_decoder_attention_node( - attention_inputs, attention_outputs, type_mask, has_mask, scale - ) - - self.nodes_to_add.append(atten_node) - self.node_name_to_graph_name[atten_node.name] = self.this_graph_name - self.nodes_to_remove.extend(remove_nodes) - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV.py 
deleted file mode 100755 index d1a1baffd56aba589caa4251d7d841e9715b8f02..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSplitQKV(Fusion): - """ - Fuse FusionSplitQKV - """ - - def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): - super().__init__(model, "SplitQKV_IxRT", "Split") - - self.hidden_size = hidden_size - self.num_heads = num_heads - - def create_node( - self, inputs: list, outputs:list - ) -> Union[NodeProto, None]: - """Create an create node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - node_name = self.model.create_node_name("SplitQKV_IxRT") - - - k_cache_output = outputs[1] - v_cache_output = outputs[2] - - concat_k_input = k_cache_output + "_k_concat_input" - concat_v_input = v_cache_output + "_v_concat_input" - - plugin_outputs = [outputs[0],concat_k_input,concat_v_input] - - new_node = helper.make_node( - "SplitQKV_IxRT", - inputs=inputs, - outputs=plugin_outputs, - name=node_name, - ) - new_node.domain = "com.iluvatar" - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - new_node.attribute.extend( - [helper.make_attribute("atten_scale", 1.0)] - ) - new_node.attribute.extend( - [helper.make_attribute("transpose", 1)] - ) - new_node.attribute.extend([helper.make_attribute("num_head", self.num_heads)]) - new_node.attribute.extend( - [helper.make_attribute("head_dim", self.hidden_size // self.num_heads)] - ) - - - - k_concat_node_name = node_name + "_k_concat" - v_concat_node_name = node_name + "_v_concat" - - k_concat_node = helper.make_node( - "Identity", - inputs=[concat_k_input], - outputs=[outputs[1]], - name=k_concat_node_name, - ) - - v_concat_node = helper.make_node( - "Identity", - inputs=[concat_v_input], - outputs=[outputs[2]], - name=v_concat_node_name, - ) - - self.model.replace_input_of_all_nodes(outputs[1],concat_k_input) - self.model.replace_input_of_all_nodes(outputs[2],concat_v_input) - return new_node,k_concat_node,v_concat_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - split_node = node - split_data = self.model.get_initializer_input_edges(node.name,return_np_array = True) - if split_data[0].shape != (3,): - return - if split_data[0][0] != split_data[0][1] and split_data[0][1] != split_data[0][2]: - return - - q_input, k_input, v_input = node.output[0],node.output[1],node.output[2] - - q_path_nodes= [] - k_path_nodes= [] - v_path_nodes= [] - - reshape_nodes = self.model.get_children(node) - - for node in reshape_nodes: - if node.op_type != "Reshape": - return - q_reshape_node,k_reshape_node,v_reshape_node = reshape_nodes[0],reshape_nodes[1],reshape_nodes[2] - - q_path_nodes.append(q_reshape_node) - k_path_nodes.append(k_reshape_node) - v_path_nodes.append(v_reshape_node) - - q_transpose_nodes = self.model.get_children(q_reshape_node) - k_transpose_nodes = self.model.get_children(k_reshape_node) - v_transpose_nodes = self.model.get_children(v_reshape_node) - - if len(q_transpose_nodes)!=1 and (not k_transpose_nodes) and len(v_transpose_nodes) != 1: - return - - - if (q_transpose_nodes[0].attribute[0].ints != [0, 2, 1, 3]) and (v_transpose_nodes[0].attribute[0].ints !=[0, 2, 1, 3]): - return - - if len(k_transpose_nodes) == 2: - if (k_transpose_nodes[0].attribute[0].ints != k_transpose_nodes[1].attribute[0].ints) and (k_transpose_nodes[0].attribute[0].ints !=[0, 2, 1, 3]): - return - - - if len(k_transpose_nodes) == 1: - if (k_transpose_nodes[0].attribute[0].ints !=[0, 2, 1, 3]): - return - - - q_transpose_node = q_transpose_nodes[0] - k_transpose_node_0 = k_transpose_nodes[0] - v_transpose_node = v_transpose_nodes[0] - - k_output = k_transpose_node_0.output[0] - - if len(k_transpose_nodes) == 2: - k_transpose_node_1 = k_transpose_nodes[1] - next_node = self.model.get_children(k_transpose_node_1) - if not next_node: - return - - self.model.replace_node_input(next_node[0], k_transpose_node_1.output[0], k_transpose_node_0.output[0]) - - - q_path_nodes.append(q_transpose_node) - v_path_nodes.append(v_transpose_node) - 
k_path_nodes.extend(k_transpose_nodes) - - plugin_inputs = [split_node.input[0]] - plugin_outputs = [q_transpose_node.output[0], k_output,v_transpose_node.output[0]] - - remove_nodes = [split_node] - - remove_nodes.extend(q_path_nodes) - remove_nodes.extend(k_path_nodes) - remove_nodes.extend(v_path_nodes) - - new_node,k_cache_concat_node, v_cache_concat_node = self.create_node(plugin_inputs, plugin_outputs) - - self.nodes_to_add.append(new_node) - self.nodes_to_add.append(k_cache_concat_node) - self.nodes_to_add.append(v_cache_concat_node) - - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.node_name_to_graph_name[k_cache_concat_node.name] = self.this_graph_name - self.node_name_to_graph_name[v_cache_concat_node.name] = self.this_graph_name - self.nodes_to_remove.extend(remove_nodes) - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV_update_KVcache.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV_update_KVcache.py deleted file mode 100644 index 6b1599d4b27cf32c74dc9c294564490ff1e799da..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV_update_KVcache.py +++ /dev/null @@ -1,188 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionCosyVoiceSplitQKVUpdateKVCache(Fusion): - """ - Fuse FusionSplitQKVUpdateKVCache - """ - - def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): - super().__init__( - model, "SplitQKVUpdateKVCache_IxRT", "Split" - ) - - self.hidden_size = hidden_size - self.num_heads = num_heads - - def create_node( - self, - inputs: list, - outputs: list, - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - node_name = self.model.create_node_name("SplitQKVUpdateKVCache_IxRT") - - k_cache_output = outputs[1] - v_cache_output = outputs[2] - - concat_k_input = k_cache_output + "_k_concat_input" - concat_v_input = v_cache_output + "_v_concat_input" - - plugin_outputs = [outputs[0],concat_k_input,concat_v_input] - - new_node = helper.make_node( - "SplitQKVUpdateKVCache_IxRT", - inputs=inputs, - outputs=plugin_outputs, - name=node_name, - ) - - k_concat_node_name = node_name + "_k_concat" - v_concat_node_name = node_name + "_v_concat" - - k_concat_node = helper.make_node( - "Identity", - inputs=[concat_k_input], - outputs=[outputs[1]], - name=k_concat_node_name, - ) - - - - v_concat_node = helper.make_node( - "Identity", - inputs=[concat_v_input], - outputs=[outputs[2]], - name=v_concat_node_name, - ) - - - - - - - new_node.domain = "com.iluvatar" - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - new_node.attribute.extend([helper.make_attribute("num_head", self.num_heads)]) - new_node.attribute.extend( - [helper.make_attribute("head_dim", self.hidden_size // self.num_heads)] - ) - - self.model.replace_input_of_all_nodes(outputs[1],concat_k_input) - self.model.replace_input_of_all_nodes(outputs[2],concat_v_input) - - return new_node,k_concat_node,v_concat_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - split_node = node - split_data = self.model.get_initializer_input_edges(node.name,return_np_array = True) - if split_data[0].shape != (3,): - return - if split_data[0][0] != split_data[0][1] and split_data[0][1] != split_data[0][2]: - return - - q_input, k_input, v_input = node.output[0],node.output[1],node.output[2] - - q_path_nodes= [] - k_path_nodes= [] - v_path_nodes= [] - - reshape_nodes = self.model.get_children(node) - - for node in reshape_nodes: - if node.op_type != "Reshape": - return - q_reshape_node,k_reshape_node,v_reshape_node = reshape_nodes[0],reshape_nodes[1],reshape_nodes[2] - - q_path_nodes.append(q_reshape_node) - k_path_nodes.append(k_reshape_node) - v_path_nodes.append(v_reshape_node) - - q_transpose_nodes = self.model.get_children(q_reshape_node) - k_transpose_nodes = self.model.get_children(k_reshape_node) - v_transpose_nodes = self.model.get_children(v_reshape_node) - - if len(q_transpose_nodes)!=1 and len(k_transpose_nodes) != 1 and len(v_transpose_nodes) != 1: - return - - - q_transpose_node = q_transpose_nodes[0] - - k_transpose_node = k_transpose_nodes[0] - v_transpose_node = v_transpose_nodes[0] - - k_path_nodes.append(k_transpose_node) - v_path_nodes.append(v_transpose_node) - - - k_concat_nodes = self.model.get_children(k_transpose_node) - v_concat_nodes = self.model.get_children(v_transpose_node) - - if len(k_transpose_nodes) != 1 or len(v_transpose_nodes) != 1: - return - - k_concat_node = k_concat_nodes[0] - v_concat_node = v_concat_nodes[0] - - if v_concat_node.attribute[0].i != 2 and k_concat_node.attribute[0].i != 2: #axis = 2 - return - - k_path_nodes.append(k_concat_node) - v_path_nodes.append(v_concat_node) - - k_cache_input = k_concat_node.input[0] - if k_transpose_node.output[0] == k_concat_node.input[0]: - k_cache_input = k_concat_node.input[1] - k_cache_output = k_concat_node.output[0] - - - - v_cache_input = v_concat_node.input[0] - if v_transpose_node.output[0] == v_concat_node.input[0]: - v_cache_input = v_concat_node.input[1] - v_cache_output = v_concat_node.output[0] - - - plugin_inputs = 
[split_node.input[0],k_cache_input,v_cache_input] - plugin_outputs = [q_transpose_node.output[0], k_cache_output,v_cache_output] - remove_nodes = [split_node, q_reshape_node,q_transpose_node] - - remove_nodes.extend(k_path_nodes) - remove_nodes.extend(v_path_nodes) - new_node,k_cache_concat_node, v_cache_concat_node= self.create_node(plugin_inputs, plugin_outputs) - - self.nodes_to_add.append(new_node) - self.nodes_to_add.append(k_cache_concat_node) - self.nodes_to_add.append(v_cache_concat_node) - - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.node_name_to_graph_name[k_cache_concat_node.name] = self.this_graph_name - self.node_name_to_graph_name[v_cache_concat_node.name] = self.this_graph_name - - self.nodes_to_remove.extend(remove_nodes) - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py deleted file mode 100644 index c2dd243357fac20057d67551c0d3d9d86b15dc68..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py +++ /dev/null @@ -1,389 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionCustomFCGPT2(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Reshape"], "gpt2") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - nodes = self.model.match_parent_path(node, ["Gemm", "Reshape"], [0, 0]) - - if nodes is None: - return False - - (matmul, reshape_before_matmul) = nodes - - matmul_weight = self.model.get_initializer(matmul.input[1]) - matmul_bias = self.model.get_initializer(matmul.input[2]) - - if matmul_weight is None or matmul_bias is None: - return False - - w = NumpyHelper.to_array(matmul_weight) - b = NumpyHelper.to_array(matmul_bias) - - transB = 0 - for attr in matmul.attribute: - if attr.name == "transB": - transB = attr.i - break - - trans_matmul_weight = w - if transB == 0: - trans_matmul_weight = w.transpose(1, 0) - if matmul_weight.name not in self.model.initializer_visited.keys(): - self.model.initializer_visited[matmul_weight.name] = True - if matmul_weight.data_type == 10: - matmul_weight.CopyFrom( - numpy_helper.from_array( - trans_matmul_weight.astype(np.float16), matmul_weight.name - ) - ) - else: - matmul_weight.CopyFrom( - numpy_helper.from_array(trans_matmul_weight, matmul_weight.name) - ) - - if matmul_bias.data_type == 10: - matmul_bias.CopyFrom( - numpy_helper.from_array(b.astype(np.float16), matmul_bias.name) - ) - else: - matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name)) - - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[reshape_before_matmul.input[0]], - outputs=node.output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", matmul_weight)]) - fused_node.attribute.extend([helper.make_attribute("B", matmul_bias)]) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - self.nodes_to_remove.extend([matmul, node, reshape_before_matmul]) - - -class FusionCustomFcRoformer(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"], "roformer fc") - - # For model Roformer. 
- - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if len(node.input) != 2: - return False - - fc_paths = { - "path1": (["Reshape", "MatMul", "Reshape"], [0, 0, 0]), - "path2": (["Reshape", "MatMul", "Reshape"], [1, 0, 0]), - } - - nodes, paths = self.match_parent_path_from_dict(node, fc_paths) - if nodes is None: - return False - - reshape_after_matmul = nodes[0] - matmul = nodes[1] - reshape_before_matmul = nodes[2] - - reshape_before_shape = None - reshape_after_shape = None - for value_info in self.model.graph().value_info: - if value_info.name == reshape_before_matmul.input[0]: - reshape_before_shape = len(value_info.type.tensor_type.shape.dim) - break - for value_info in self.model.graph().value_info: - if value_info.name == reshape_after_matmul.output[0]: - reshape_after_shape = len(value_info.type.tensor_type.shape.dim) - break - if reshape_before_shape != reshape_after_shape: - return False - - weight = self.model.get_initializer(matmul.input[1]) - bias = self.model.get_initializer(node.input[1]) or self.model.get_initializer( - node.input[0] - ) - - if weight is None or bias is None: - return False - - w = NumpyHelper.to_array(weight) - w_in_size = w.shape[0] - weight_dim = np.prod(w.shape[1:]) - - b = NumpyHelper.to_array(bias) - bias_dim = np.prod(b.shape) - trans_matmul_weight = w.transpose(1, 0) - weight.CopyFrom(onnx.numpy_helper.from_array(trans_matmul_weight, weight.name)) - # Sometimes weights and bias are stored in fp16 - if weight.data_type == 10: - weight.CopyFrom( - numpy_helper.from_array( - trans_matmul_weight.astype(np.float16), weight.name - ) - ) - bias_arr = onnx.numpy_helper.to_array(bias).flatten() - bias.CopyFrom(onnx.numpy_helper.from_array(bias_arr, bias.name)) - if bias.data_type == 10: - bias.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(bias).astype(np.float16), bias.name - ) - ) - - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[reshape_before_matmul.input[0]], - outputs=node.output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", weight)]) - fused_node.attribute.extend([helper.make_attribute("B", bias)]) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - - self.nodes_to_remove.extend([node]) - self.nodes_to_remove.extend(nodes) - return True - - -class FusionCustomFC(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"]) - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if self.fuse_1(node, input_name_to_nodes, output_name_to_node): - return - - def fuse_1(self, node, input_name_to_nodes, output_name_to_node): - if len(node.input) != 2: - return False - nodes = self.model.match_parent_path(node, ["MatMul"], [None]) - - if nodes is None: - return False - matmul = nodes[0] - - matmul_weight = self.model.get_initializer(matmul.input[1]) - matmul_bias = self.model.get_initializer( - node.input[1] - ) or self.model.get_initializer(node.input[0]) - - if matmul_weight is None or 
matmul_bias is None: - return False - - w = NumpyHelper.to_array(matmul_weight) - b = NumpyHelper.to_array(matmul_bias) - - trans_matmul_weight = w.transpose(1, 0) - if matmul_weight.name not in self.model.initializer_visited.keys(): - self.model.initializer_visited[matmul_weight.name] = True - if matmul_weight.data_type == 10: - matmul_weight.CopyFrom( - numpy_helper.from_array( - trans_matmul_weight.astype(np.float16), matmul_weight.name - ) - ) - else: - matmul_weight.CopyFrom( - numpy_helper.from_array(trans_matmul_weight, matmul_weight.name) - ) - - if matmul_bias.data_type == 10: - matmul_bias.CopyFrom( - numpy_helper.from_array(b.astype(np.float16), matmul_bias.name) - ) - else: - matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name)) - - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[matmul.input[0]], - outputs=node.output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", matmul_weight)]) - fused_node.attribute.extend([helper.make_attribute("B", matmul_bias)]) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - self.nodes_to_remove.extend([matmul, node]) - return True - - -class FusionCustomFCActivation(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, - "CustomFCPluginDynamic_IxRT", - ["Gelu", "Relu", "CustomGeluPluginDynamic_IxRT", "Mul"], - "with activation", - ) - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if node.op_type == "Mul": - return_indice = [] - nodes = self.model.match_parent_path( - node, - ["Sigmoid", "Mul", "CustomFCPluginDynamic_IxRT"], - [None, 0, 0], - return_indice=return_indice, - ) - if nodes is None: - return - - (sigmoid_node, mul_node, custom_fc_node) = nodes - if output_name_to_node[node.input[1 - return_indice[0]]] != custom_fc_node: - return - - activation_type = 20 - for attr in custom_fc_node.attribute: - if attr.name == "act_type": - attr.i = activation_type - break - - custom_fc_node.output[0] = node.output[0] - self.nodes_to_add.append(custom_fc_node) - self.nodes_to_remove.extend([node, sigmoid_node, mul_node, custom_fc_node]) - self.node_name_to_graph_name[custom_fc_node.name] = self.this_graph_name - else: - nodes = self.model.match_parent_path( - node, ["CustomFCPluginDynamic_IxRT"], [0] - ) - - if nodes is None: - logger.debug("CustomFCActivation: failed to match fc+gelu/relu path") - return - - fc_node = nodes[0] - activation_type = 3 - if node.op_type == "Gelu": - activation_type = 3 - if node.op_type == "Relu": - activation_type = 4 - - for attr in fc_node.attribute: - if attr.name == "act_type": - attr.i = activation_type - break - - fc_node.output[0] = node.output[0] - self.nodes_to_add.append(fc_node) - self.nodes_to_remove.extend([node, fc_node]) - self.node_name_to_graph_name[fc_node.name] = self.this_graph_name - - -class FusionConformerCustomFCActivation(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, - "CustomFCPluginDynamic_IxRT", - ["Mul"], - "with activation", - ) - - 
def fuse(self, node, input_name_to_nodes, output_name_to_node): - - # return_indice = [] - nodes = self.model.match_parent_path( - node, - ["Sigmoid", "CustomFCPluginDynamic_IxRT"], - [ - None, - 0, - ], - # return_indice=return_indice, - ) - if nodes is None: - return - (sigmoid_node, custom_fc_node) = nodes - # if output_name_to_node[node.input[1 - return_indice[0]]] != custom_fc_node: - # return - activation_type = 20 - for attr in custom_fc_node.attribute: - if attr.name == "act_type": - attr.i = activation_type - break - custom_fc_node.attribute.extend([helper.make_attribute("swish_alpha", 1.0)]) - custom_fc_node.output[0] = node.output[0] - self.nodes_to_add.append(custom_fc_node) - self.nodes_to_remove.extend([node, sigmoid_node, custom_fc_node]) - self.node_name_to_graph_name[custom_fc_node.name] = self.this_graph_name - - -class FusionTorchvisionVitCustomFC(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomFCPluginDynamic_IxRT", ["CustomQKVToContextPluginDynamic_IxRT"], "torchvision vit custom_fc",) - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - custom_fc_node_0 = self.model.get_children(node, input_name_to_nodes) - transpose_node_0 = self.model.get_children(custom_fc_node_0[0], input_name_to_nodes) - - if transpose_node_0[0].op_type != "Transpose": - return - - custom_fc_node_0[0].output[0] = transpose_node_0[0].output[0] - - nodes = self.model.match_parent_path(node, ["CustomFCPluginDynamic_IxRT","Transpose"], [0, 0]) - if nodes is None: - return - - (custom_fc_node_1, transpose_node_1) = nodes - custom_fc_node_1.input[0] = transpose_node_1.input[0] - - self.nodes_to_add.append(custom_fc_node_1) - self.nodes_to_add.append(custom_fc_node_0[0]) - self.nodes_to_remove.extend([transpose_node_1, custom_fc_node_1, transpose_node_0[0], custom_fc_node_0[0]]) - self.node_name_to_graph_name[custom_fc_node_1.name] = self.this_graph_name - self.node_name_to_graph_name[custom_fc_node_0[0].name] = self.this_graph_name - \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py deleted file mode 100644 index 670a767e18e3ccd13d5540c9a415aa3ad8fc7525..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
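For readers tracking the `act_type` attribute set throughout the removed CustomFC passes, the integer codes observed in this file are summarized below; this is a descriptive recap of the deleted code, not an authoritative list of the plugin's supported values.

```python
# act_type values as assigned by the deleted CustomFC fusion passes
CUSTOM_FC_ACT_TYPE = {
    "none": -1,   # default when the CustomFC node is first created
    "gelu": 3,
    "relu": 4,
    "swish": 20,  # Sigmoid*Mul pattern; the conformer variant also adds swish_alpha=1.0
}
```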
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionDisentangledAttention(Fusion): - """ - Match Disentangled Attention - ------------------------------------------- - | - GatherElements --> Add --> Add --> - | - GatherElements --> Transpose -> - """ - - def __init__(self, model: OnnxModel): - super().__init__(model, "DisentangledAttention_IxRT", "Add") - - def create_disentangled_attention_node( - self, - inputs: List[str], - outputs: List[str], - ) -> Union[NodeProto, None]: - """Create an disentangled attention node. - - Args: - inputs List[str]: data input names - outputs List[str]: data output names - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - disentangled_attention_node_name = self.model.create_node_name( - "DisentangledAttention" - ) - - disentangled_attention_node = helper.make_node( - "DisentangledAttention_IxRT", - inputs=inputs, - outputs=outputs, - name=disentangled_attention_node_name, - ) - disentangled_attention_node.domain = "com.iluvatar" - disentangled_attention_node.attribute.extend( - [helper.make_attribute("plugin_namespace", "")] - ) - disentangled_attention_node.attribute.extend( - [helper.make_attribute("plugin_version", "1")] - ) - disentangled_attention_node.attribute.extend( - [helper.make_attribute("factor", 0.1)] - ) - disentangled_attention_node.attribute.extend( - [helper.make_attribute("span", 512)] - ) - - return disentangled_attention_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - disentangled_attention_path1 = { - "path": (["Add", "GatherElements", "MatMul"], [None, None, None]), - } - - disentangled_attention_path2 = { - "path": ( - ["Add", "Transpose", "GatherElements", "MatMul"], - [None, None, None, None], - ), - } - - nodes1, _ = self.match_parent_path_from_dict(node, disentangled_attention_path1) - nodes2, _ = self.match_parent_path_from_dict(node, disentangled_attention_path2) - - if nodes1 is not None and nodes2 is not None: - if nodes1[0] == nodes2[0]: - (head_add, first_gather, first_matmul) = nodes1 - (_, transpose, second_gather, second_matmul) = nodes2 - tail_add = node - - first_input = [i for i in tail_add.input if i != head_add.output[0]][0] - second_input = first_matmul.output[0] - third_input = second_matmul.output[0] - output = tail_add.output[0] - - disentangled_attention_node = self.create_disentangled_attention_node( - [first_input, second_input, third_input], [output] - ) - self.nodes_to_add.append(disentangled_attention_node) - self.node_name_to_graph_name[ - disentangled_attention_node.name - ] = self.this_graph_name - self.nodes_to_remove.append(tail_add) - self.nodes_to_remove.append(head_add) - self.nodes_to_remove.append(first_gather) - self.nodes_to_remove.append(transpose) - self.nodes_to_remove.append(second_gather) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py deleted file mode 100644 index f46fa2c77da83612a25dd7bde215f20e70845ff7..0000000000000000000000000000000000000000 --- 
a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py +++ /dev/null @@ -1,1078 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Dict, List, Tuple, Union - -from onnx import NodeProto, TensorProto, helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionEmbedLayerNoMask(Fusion): - """ - Fuse embedding layer into one node (EmbedLayerNormalization). - It supports the following model types: BERT, DistilBert, ALBert. - """ - - def __init__(self, model: OnnxModel, description: str = "no mask"): - super().__init__( - model, - "EmbedLayerNormalization", - ["LayerNormalization", "SkipLayerNormalization"], - description, - ) - self.utils = FusionUtils(model) - self.shape_infer_helper = self.model.infer_runtime_shape({}, update=True) - # The following will be reset in each fuse call of FusionEmbedLayerNormalization - self.attention = None - self.embed_node = None - - def match_two_gather( - self, add: NodeProto - ) -> Union[None, Tuple[NodeProto, NodeProto]]: - gather_0_path = self.model.match_parent_path(add, ["Gather"], [0]) - if gather_0_path is None: - return None - - gather_1_path = self.model.match_parent_path(add, ["Gather"], [1]) - if gather_1_path is None: - return None - - return gather_0_path[0], gather_1_path[0] - - def check_attention_subgraph( - self, - layernorm: NodeProto, - input_name_to_nodes: Dict[str, List[NodeProto]], - is_distil_bert: bool, - ) -> bool: - """Check that LayerNormalization has a child of Attention node or subgraph like Attention. - - Args: - layernorm (NodeProto): LayerNormalization node - input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes - is_distil_bert (bool): whether it is DistilBert or not - - Returns: - bool: whether there is Attention node or subgraph like Attention - """ - self.attention = self.model.find_first_child_by_type( - layernorm, "Attention", input_name_to_nodes, recursive=False - ) - if self.attention is None: - # In case user disables attention fusion, check whether subgraph looks like Attention. - if layernorm.output[0] not in input_name_to_nodes: - return False - children = input_name_to_nodes[layernorm.output[0]] - - # For Albert, there is MatMul+Add after embedding layer before attention. 
- if ( - len(children) == 1 - and children[0].op_type == "MatMul" - and children[0].output[0] in input_name_to_nodes - ): - grandchildren = input_name_to_nodes[children[0].output[0]] - if ( - len(grandchildren) == 1 - and grandchildren[0].op_type == "Add" - and grandchildren[0].output[0] in input_name_to_nodes - ): - nodes = input_name_to_nodes[grandchildren[0].output[0]] - for node in nodes: - if node.op_type == "Attention": - self.attention = node - return True - children_types = sorted([child.op_type for child in nodes]) - else: - children_types = sorted([child.op_type for child in children]) - - # Two Shape nodes might be merged by ORT - if is_distil_bert: - # SkipLayerNormailization might exist when model has been optimized by ORT first. - if ( - children_types - != ["MatMul", "MatMul", "MatMul", "Shape", "SkipLayerNormalization"] - and children_types - != ["Add", "MatMul", "MatMul", "MatMul", "Shape", "Shape"] - and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape"] - ): - logger.debug( - "No Attention like subgraph in children of LayerNormalization" - ) - return False - else: - if children_types != [ - "Add", - "MatMul", - "MatMul", - "MatMul", - ] and children_types != [ - "MatMul", - "MatMul", - "MatMul", - "SkipLayerNormalization", - ]: - logger.debug( - "No Attention like subgraph in children of LayerNormalization" - ) - return False - return True - - def match_position_embedding_distilbert( - self, position_embedding_gather, input_ids, output_name_to_node - ): - """ Match position embedding path from input_ids to Gather for DistilBert. - - Pattern is like the following: - (input_ids) - | - Shape - | \ - | Gather (indices=1) - | | - | Cast (optional) - | | - | Range (start=0, end=*, delta=1) - | | - | Unsqueeze - | / - Expand - | - Gather - """ - # remove after tests pass - path1 = self.model.match_parent_path( - position_embedding_gather, ["Expand", "Shape"], [1, 1] - ) - if path1 is None: - path1 = self.model.match_parent_path( - position_embedding_gather, - ["Expand", "Where", "Reshape", "Shape"], - [1, 1, 2, 0], - ) - if path1 is None: - return False - - expand, shape = path1[0], path1[-1] - if shape.input[0] != input_ids: - return False - - _, path2, _ = self.model.match_parent_paths( - expand, - [ - (["Unsqueeze", "Range", "Cast", "Gather", "Shape"], [0, 0, 1, 0, 0]), - (["Unsqueeze", "Range", "Gather", "Shape"], [0, 0, 1, 0]), - ], - output_name_to_node, - ) - if path2 is None: - return False - - range_node = path2[1] - if not ( - self.utils.check_node_input_value(range_node, 0, 0) - and self.utils.check_node_input_value(range_node, 2, 1) - ): - return False - - gather_node = path2[-2] - if not (self.utils.check_node_input_value(gather_node, 1, 1)): - return False - - shape_node = path2[-1] - if shape_node.input[0] != input_ids: - return False - - return True - - def match_position_embedding_roberta( - self, position_embedding_gather, input_ids, output_name_to_node - ): - """Match position embedding path from input_ids to Gather for Roberta. - - Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id): - (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather - | ^ - V | - +------------------------------+ - - Roberta new pattern from transformers v4.9: - (input_ids) --> Equal(B=?) 
-- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather - | ^ - V | - +-------------------------------------------+ - - start_node = position_embedding_gather - start_index = 1 - - # match optional Cast node. - parent = self.model.get_parent(start_node, start_index, output_name_to_node) - if parent is None: - return - if parent.op_type == "Cast": - if OnnxModel.get_node_attribute(parent, "to") != 7: - return - start_node = parent - start_index = 0 - - i, path, return_indices = self.model.match_parent_paths( - start_node, - [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]), - (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])], - output_name_to_node) - - if path is not None: - # constant input of Add shall be 1. - i, value = self.model.get_constant_input(path[0]) - if value != 1: - return False - - _, self.padding_word_id = self.model.get_constant_input(path[-1]) - - return input_ids == path[-1].input[0] - """ - - return False - - def match_position_embedding_bert( - self, position_embedding_gather, input_ids, output_name_to_node - ): - """ Match position embedding path from input_ids to Gather for BERT. - - BERT Embedding Layer Pattern: - (input_ids) - / \ - / Shape - / | - / Gather (indices=1) - / | - / Add (optional, B=0) - / | - Gather (segment_ids) Unsqueeze (axes=0) - \ | | - \ Gather Slice (data[1,512], starts=0, ends=*, axes=1, steps=1) - \ / | - Add Gather - \ / - Add - | - LayerNormalization - """ - path = self.model.match_parent_path( - position_embedding_gather, - ["Slice", "Unsqueeze"], - [1, 2], - output_name_to_node, - ) - if path is None: - return False - - slice, unsqueeze = path - slice_weight = self.model.get_constant_value(slice.input[0]) - if not ( - slice_weight is not None - and len(slice_weight.shape) == 2 - and slice_weight.shape[0] == 1 - and self.utils.check_node_input_value(slice, 1, [0]) - and self.utils.check_node_input_value(slice, 3, [1]) - and ( - len(slice.input) == 4 - or self.utils.check_node_input_value(slice, 4, [1]) - ) - ): - return False - - opset_version = self.model.get_opset_version() - if opset_version < 13: - if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]): - return False - else: - if not self.utils.check_node_input_value(unsqueeze, 1, [0]): - return False - - node = self.model.get_parent(unsqueeze, 0, output_name_to_node) - if node is None: - return False - if node.op_type == "Add": - if not self.utils.check_node_input_value(node, 1, 0): - return False - gather = self.model.get_parent(node, 0, output_name_to_node) - else: - gather = node - - if gather is None or gather.op_type != "Gather": - return False - if not (self.utils.check_node_input_value(gather, 1, 1)): - return False - - shape = self.model.get_parent(gather, 0, output_name_to_node) - if shape is None or shape.op_type != "Shape": - return False - - return input_ids == shape.input[0] - - def match_position_embedding( - self, position_embedding_gather, input_ids, output_name_to_node - ): - if self.match_position_embedding_bert( - position_embedding_gather, input_ids, output_name_to_node - ): - return True - - # TODO: Support roberta (position starts from 2 instead of 0) in EmbedLayerNormalization kernel - # related: https://github.com/huggingface/transformers/issues/10736 - # if self.match_position_embedding_roberta(position_embedding_gather, input_ids, output_name_to_node): - # return True - - if self.match_position_embedding_distilbert( 
- position_embedding_gather, input_ids, output_name_to_node - ): - return True - - return False - - def check_embedding( - self, word_embedding_gather, segment_embedding_gather, position_embedding_gather - ): - """Sanity check of embedding weights, and match hidden_size of weights and shape of inputs.""" - input_ids = word_embedding_gather.input[1] - segment_ids = ( - segment_embedding_gather.input[1] if segment_embedding_gather else None - ) - position_ids = position_embedding_gather.input[1] - - if self.shape_infer_helper is not None: - input_ids_shape = self.shape_infer_helper.get_edge_shape(input_ids) - position_ids_shape = self.shape_infer_helper.get_edge_shape(position_ids) - assert input_ids_shape and position_ids_shape - if not ( - len(input_ids_shape) == 2 - and len(position_ids_shape) == 2 - and input_ids_shape[1] == position_ids_shape[1] - ): - logger.info( - "Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}".format( - input_ids_shape, position_ids_shape - ) - ) - return False - - if segment_ids and not self.shape_infer_helper.compare_shape( - input_ids, segment_ids - ): - logger.info( - "Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".format( - input_ids_shape, - self.shape_infer_helper.get_edge_shape(segment_ids), - ) - ) - return False - - word_embedding_table = self.model.get_constant_value( - word_embedding_gather.input[0] - ) - if word_embedding_table is None or len(word_embedding_table.shape) != 2: - logger.info( - "Cannot fuse EmbedLayerNormalization: word embedding table is not expected" - ) - return False - - position_embedding_table = self.model.get_constant_value( - position_embedding_gather.input[0] - ) - if ( - position_embedding_table is None - or len(position_embedding_table.shape) != 2 - or (word_embedding_table.shape[1] != position_embedding_table.shape[1]) - ): - logger.info( - "Cannot fuse EmbedLayerNormalization: position embedding table is not expected" - ) - return False - - if segment_ids: - segment_embedding_table = self.model.get_constant_value( - segment_embedding_gather.input[0] - ) - if ( - segment_embedding_table is None - or len(segment_embedding_table.shape) != 2 - or (word_embedding_table.shape[1] != segment_embedding_table.shape[1]) - ): - logger.info( - "Cannot fuse EmbedLayerNormalization: segment embedding table is not expected" - ) - return False - - # In normal case, word embeding table is the largest, and segment embedding table is the smallest, while postion embedding table is in between. - # TODO: use other information (like initializer names) to identify different embedding weights automatically. 
- if word_embedding_table.shape[0] <= position_embedding_table.shape[0]: - logger.warning( - f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]}" - ) - - if segment_ids: - if word_embedding_table.shape[0] <= segment_embedding_table.shape[0]: - logger.warning( - f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}" - ) - - if position_embedding_table.shape[0] <= segment_embedding_table.shape[0]: - logger.warning( - f"position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}" - ) - - return True - - def cast_to_int32(self, input_name: str) -> Tuple[str, Union[None, NodeProto]]: - """Cast a graph input or node input to int32. - - Args: - input_name (str): name of graph input or node input - - Returns: - A tuple of casted input name and the cast node. - int32_output (str): If input is int32, it is the input name, Otherwise it is output name of Cast node. - input_cast_node (Union[None, NodeProto]): Cast node. It could be None if input is int32. - """ - input_cast_node = None - graph_input = self.model.find_graph_input(input_name) - if graph_input is not None: - if graph_input.type.tensor_type.elem_type != TensorProto.INT32: - int32_output, input_cast_node = self.utils.cast_input_to_int32( - input_name - ) - else: - int32_output = input_name - else: - int32_output, input_cast_node = self.utils.cast_input_to_int32(input_name) - - return int32_output, input_cast_node - - def create_fused_node( - self, - input_ids: str, - layernorm: NodeProto, - word_embedding_gather: NodeProto, - position_embedding_gather: NodeProto, - segment_embedding_gather: Union[None, NodeProto], - position_ids: str = None, - embedding_sum_output=False, - ): - """Create an EmbedLayerNormalization node. Note that segment embedding is optional. - - Args: - input_ids (str): input_ids for word embeddings - layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node. - word_embedding_gather (NodeProto): the Gather node for word embedding - position_embedding_gather (NodeProto): the Gather node for position embedding - segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None. - - Returns: - NodeProto: the EmbedLayerNormalization node created. 
- """ - nodes_to_add = [] - input_ids, _ = self.cast_to_int32(input_ids) - - node_name = self.model.create_node_name("EmbedLayerNormalization") - - if layernorm.op_type == "LayerNormalization": - gamma = layernorm.input[1] - beta = layernorm.input[2] - else: # SkipLayerNormalization - gamma = layernorm.input[2] - beta = layernorm.input[3] - - embed_node_inputs = None - if segment_embedding_gather is not None: - segment_ids, _ = self.cast_to_int32(segment_embedding_gather.input[1]) - - embed_node_inputs = [ - input_ids, - segment_ids, - word_embedding_gather.input[0], - position_embedding_gather.input[0], - segment_embedding_gather.input[0], - gamma, - beta, - ] - else: # no segment embedding - embed_node_inputs = [ - input_ids, - "", - word_embedding_gather.input[0], - position_embedding_gather.input[0], - "", - gamma, - beta, - ] - - if position_ids is not None: - # Adding an empty input for mask before position_ids - embed_node_inputs.append("") - position_ids, _ = self.cast_to_int32(position_ids) - embed_node_inputs.append(position_ids) - - embed_node_outputs = [node_name + "_output", node_name + "_dummy_mask_index"] - if embedding_sum_output: - embed_node_outputs.append(node_name + "_embedding_sum") - - embed_node = helper.make_node( - "EmbedLayerNormalization", - embed_node_inputs, - outputs=embed_node_outputs, - name=node_name, - ) - - embed_node.domain = "com.microsoft" - - # Pass attribute "epsilon" from normalize node to EmbedLayerNormalization. - for att in layernorm.attribute: - if att.name == "epsilon": - embed_node.attribute.extend([att]) - - # Set default value to 1e-12 if no attribute is found. - # OnnxRuntime 1.2.0 or older has no epsilon attribute. The optimized model can only work for 1.3.0 or later. - if len(embed_node.attribute) == 0: - embed_node.attribute.extend([helper.make_attribute("epsilon", 1.0e-12)]) - - # Make sure new EmbedLayerNormalization node is the last one in self.nodes_to_add. 
- nodes_to_add.append(embed_node) - for node in nodes_to_add: - self.node_name_to_graph_name[node.name] = self.this_graph_name - self.nodes_to_add.extend(nodes_to_add) - - self.embed_node = embed_node - return embed_node - - def finish_fusion(self, layernorm, embed_node): - self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0]) - # use prune graph to remove nodes that is not needed - self.prune_graph = True - - def is_embedding_sum_needed(self, add_before_layer_norm): - """Check that Add before layer norm has an output to add before next layernorm - - Args: - add_before_layer_norm (NodeProto): Add before any LayerNormalization node in topological order of graph - - Returns: - bool: whether there is an extra output needed out of embed layer norm node - """ - - nodes = self.model.get_children(add_before_layer_norm) - - return len(nodes) > 1 - - def fuse_gpt2( - self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - # graph checks - # gpt2 has no segment embedding, subgraph pattern is like - # input_ids position_ids - # | | - # Gather Gather - # \ / - # Add _ _ _ _ _ - # | | - # LayerNormalization | - # | | - # Attention | - # | | - # Matmul | - # | / - # Add / - # \ / - # Add - two_gather = self.match_two_gather(add_before_layernorm) - if two_gather is None: - return False - - add_output = add_before_layernorm.output[0] - - word_embedding_gather, position_embedding_gather = two_gather - input_ids = word_embedding_gather.input[1] - position_ids = position_embedding_gather.input[1] - - if not self.check_attention_subgraph( - layernorm, input_name_to_nodes, is_distil_bert=False - ): - return False - - if not self.check_embedding( - word_embedding_gather, None, position_embedding_gather - ): - return False - - optional_embedding_sum_output = False - if self.is_embedding_sum_needed(add_before_layernorm): - optional_embedding_sum_output = True - - # make the fused node - embed_node = self.create_fused_node( - input_ids, - layernorm, - word_embedding_gather, - position_embedding_gather, - None, - position_ids, - optional_embedding_sum_output, - ) - - # direct the output to another add too - self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0]) - if optional_embedding_sum_output: - self.model.replace_input_of_all_nodes(add_output, embed_node.output[2]) - - return True - - def fuse_distilbert( - self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - """Fuse embedding layer for DistilBert - Args: - layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization - add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself - input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes - output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes - """ - - # DistilBert has no segment embedding, subgraph pattern is like - # input_ids - # | \ - # | (position_embedding_subgraph) - # | | - # Gather Gather - # \ / - # Add - # | - # LayerNormalization - two_gather = self.match_two_gather(add_before_layernorm) - if two_gather is None: - return False - - word_embedding_gather, position_embedding_gather = two_gather - input_ids = word_embedding_gather.input[1] - - if not self.check_attention_subgraph( - layernorm, input_name_to_nodes, is_distil_bert=True - ): - return False - - if not self.match_position_embedding( - position_embedding_gather, input_ids, output_name_to_node - ): - return False - - 
if not self.check_embedding( - word_embedding_gather, None, position_embedding_gather - ): - return False - - embed_node = self.create_fused_node( - input_ids, layernorm, word_embedding_gather, position_embedding_gather, None - ) - self.finish_fusion(layernorm, embed_node) - return True - - def fuse_bert( - self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - """Fuse embedding layer for Bert - Args: - layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization - add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself - input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes - output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes - """ - - add_2_gather = self.model.match_parent_path(add_before_layernorm, ["Add"], [0]) - if add_2_gather is None: - return False - - two_gather = self.match_two_gather(add_2_gather[0]) - if two_gather is None: - return False - - word_embedding_gather, segment_embedding_gather = two_gather - - input_ids = word_embedding_gather.input[1] - - if not self.check_attention_subgraph( - layernorm, input_name_to_nodes, is_distil_bert=False - ): - return False - - position_embedding_path = self.model.match_parent_path( - add_before_layernorm, ["Gather"], [1] - ) - if position_embedding_path is None: - return False - - position_embedding_gather = position_embedding_path[0] - if not self.match_position_embedding( - position_embedding_gather, input_ids, output_name_to_node - ): - if not self.match_position_embedding( - segment_embedding_gather, input_ids, output_name_to_node - ): - return False - # position and segment are switched - temp = segment_embedding_gather - segment_embedding_gather = position_embedding_gather - position_embedding_gather = temp - - if not self.check_embedding( - word_embedding_gather, segment_embedding_gather, position_embedding_gather - ): - return False - - embed_node = self.create_fused_node( - input_ids, - layernorm, - word_embedding_gather, - position_embedding_gather, - segment_embedding_gather, - ) - self.finish_fusion(layernorm, embed_node) - return True - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if node.op_type == "LayerNormalization": - first_add_path = self.model.match_parent_path(node, ["Add"], [0]) - if first_add_path is None: - return - add_before_layernorm = first_add_path[0] - else: # SkipLayerNormalization - add_before_layernorm = node # Add is fused into SkipLayerNormalization - - if self.fuse_gpt2( - node, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - return - - if self.fuse_distilbert( - node, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - return - - if self.fuse_bert( - node, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - return - - -class FusionEmbedLayerNormalization(FusionEmbedLayerNoMask): - def __init__(self, model: OnnxModel): - super().__init__(model, "with mask") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - # Reset attention and embed_node so that we know fusion is successful when they are not None. 
- self.attention = None - self.embed_node = None - super().fuse(node, input_name_to_nodes, output_name_to_node) - - if self.attention and self.embed_node: - mask_index = self.attention.input[3] - if mask_index in output_name_to_node: - node = output_name_to_node[mask_index] - if node.op_type == "ReduceSum": - embed_node = self.embed_node - mask_input_name = node.input[0] - self.nodes_to_remove.extend([node]) - embed_node.input.append(mask_input_name) - embed_node.output[1] = mask_index - - -class FusionBertEmbedLayerNormalization(Fusion): - """ - Fuse BertEmbedLayerNormalization subgraph into one node. - """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomEmbLayerNormPluginDynamic_IxRT", "CustomQKVToContextPluginDynamic_IxRT" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - input --> CustomEmbLayerNormPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT --> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT - """ - children = self.model.get_children(node, input_name_to_nodes) - parent = self.model.get_parents(node, output_name_to_node) - - if len(children) == 0: - return - if len(parent) == 0: - return - - start_node = node - - # word_embeddings - word_embeddings_node = self.model.match_parent_path( - start_node, - ["CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"], - [0, 0, 0, 0, 0], - output_name_to_node, - ) - - # token_type_embeddings - token_type_embeddings_node = self.model.match_parent_path( - start_node, - ["CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"], - [0, 0, 0, 0, 1], - output_name_to_node, - ) - - # attention_mask - attention_mask_node = self.model.match_parent_path( - start_node, - ["Mul", "Sub", "Cast", "Unsqueeze"], - [1, 0, 1, 0], - output_name_to_node, - ) - - if word_embeddings_node is None or token_type_embeddings_node is None or attention_mask_node is None: - return - - if word_embeddings_node and token_type_embeddings_node and attention_mask_node: - subgraph_nodes = [] - subgraph_nodes.extend(word_embeddings_node) - subgraph_nodes.extend(token_type_embeddings_node) - subgraph_nodes.extend(attention_mask_node) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - subgraph_nodes_remove = [] - for item in subgraph_nodes_unique: - if item.op_type != "CustomFCPluginDynamic_IxRT": - subgraph_nodes_remove.append(item) - - # input_ids = self.model.get_graph_inputs_excluding_initializers()[0] - # token_type_ids = self.model.get_graph_inputs_excluding_initializers()[1] - # attention_mask = self.model.get_graph_inputs_excluding_initializers()[2] - - emblayernorm_out = word_embeddings_node[1].output[0] - emblayernorm_out_mask = attention_mask_node[0].output[0] - - # self.model.modify_node_output_type(emblayernorm_out_mask, 5) - - beta_data = self.model.get_initializer(word_embeddings_node[1].input[2], True) - embeddings_layernorm_beta_name = "bert_embeddings_layernorm_beta" - embeddings_layernorm_beta = helper.make_tensor( - embeddings_layernorm_beta_name, TensorProto.FLOAT, beta_data.shape, beta_data.flatten().tolist()) - - gamma_data = self.model.get_initializer(word_embeddings_node[1].input[1], True) - embeddings_layernorm_gamma_name = "bert_embeddings_layernorm_gamma" - embeddings_layernorm_gamma = helper.make_tensor( - embeddings_layernorm_gamma_name, TensorProto.FLOAT, gamma_data.shape, gamma_data.flatten().tolist()) - - 
embeddings_word_embeddings_data = self.model.get_initializer(word_embeddings_node[4].input[0], True) - embeddings_word_embeddings_name = "bert_embeddings_word_embeddings" - embeddings_word_embeddings = helper.make_tensor( - embeddings_word_embeddings_name, TensorProto.FLOAT, embeddings_word_embeddings_data.shape, - embeddings_word_embeddings_data.flatten().tolist()) - - embeddings_token_type_embeddings_data = self.model.get_initializer(token_type_embeddings_node[4].input[0], True) - embeddings_token_type_embeddings_name = "bert_embeddings_token_type_embeddings" - embeddings_token_type_embeddings = helper.make_tensor( - embeddings_token_type_embeddings_name, TensorProto.FLOAT, embeddings_token_type_embeddings_data.shape, - embeddings_token_type_embeddings_data.flatten().tolist()) - - embeddings_position_embeddings_data = self.model.get_initializer(token_type_embeddings_node[2].input[1], True) - embeddings_position_embeddings_name = "bert_embeddings_token_type_embeddings" - embeddings_position_embeddings = helper.make_tensor( - embeddings_position_embeddings_name, TensorProto.FLOAT, embeddings_position_embeddings_data.shape, - embeddings_position_embeddings_data.flatten().tolist()) - - self.model.add_initializer(embeddings_layernorm_beta, self.this_graph_name) - self.model.add_initializer(embeddings_layernorm_gamma, self.this_graph_name) - self.model.add_initializer(embeddings_word_embeddings, self.this_graph_name) - self.model.add_initializer(embeddings_token_type_embeddings, self.this_graph_name) - self.model.add_initializer(embeddings_position_embeddings, self.this_graph_name) - - - emblayernorm_node = helper.make_node( - "CustomEmbLayerNormPluginDynamic_IxRT", - inputs=[word_embeddings_node[4].input[1], token_type_embeddings_node[4].input[1], attention_mask_node[3].input[0]], - outputs=[emblayernorm_out, emblayernorm_out_mask], - name=self.model.create_node_name( - "BertEmbedLayerNormalization", name_prefix="BertEmbedLayerNormalization" - ), - ) - emblayernorm_node.domain = "com.iluvatar" - emblayernorm_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - emblayernorm_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - emblayernorm_node.attribute.extend([helper.make_attribute("output_fp16", 1)]) - emblayernorm_node.attribute.extend([helper.make_attribute("full_mask", 1)]) - emblayernorm_node.attribute.extend([helper.make_attribute("mha_type_id", 2)]) - emblayernorm_node.attribute.extend([helper.make_attribute("pad_id", 0)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_beta", embeddings_layernorm_beta)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_gamma", embeddings_layernorm_gamma)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_word_embeddings", embeddings_word_embeddings)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_token_type_embeddings", embeddings_token_type_embeddings)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_position_embeddings", embeddings_position_embeddings)]) - - self.nodes_to_remove.extend(subgraph_nodes_remove) - - self.nodes_to_add.append(emblayernorm_node) - self.node_name_to_graph_name[emblayernorm_node.name] = self.this_graph_name - - -class FusionAlbertEmbedLayerNormalization(Fusion): - """ - Fuse AlbertEmbedLayerNormalization subgraph into one node. 
- """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomEmbLayerNormPluginDynamic_IxRT", "CustomQKVToContextPluginDynamic_IxRT" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - input --> CustomEmbLayerNormPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT --> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT - """ - children = self.model.get_children(node, input_name_to_nodes) - parent = self.model.get_parents(node, output_name_to_node) - - if len(children) == 0: - return - if len(parent) == 0: - return - - start_node = node - - # word_embeddings - word_embeddings_node = self.model.match_parent_path( - start_node, - ["CustomFCPluginDynamic_IxRT","CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"], - [0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - # token_type_embeddings - token_type_embeddings_node = self.model.match_parent_path( - start_node, - ["CustomFCPluginDynamic_IxRT","CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"], - [0, 0, 0, 0, 0, 1], - output_name_to_node, - ) - - # attention_mask - attention_mask_node = self.model.match_parent_path( - start_node, - ["Mul", "Sub", "Cast", "Unsqueeze"], - [1, 0, 1, 0], - output_name_to_node, - ) - - if word_embeddings_node is None or token_type_embeddings_node is None or attention_mask_node is None: - return - - if word_embeddings_node and token_type_embeddings_node and attention_mask_node: - subgraph_nodes = [] - subgraph_nodes.extend(word_embeddings_node) - subgraph_nodes.extend(token_type_embeddings_node) - subgraph_nodes.extend(attention_mask_node) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - subgraph_nodes_remove = [] - for item in subgraph_nodes_unique: - if item.op_type != "CustomFCPluginDynamic_IxRT": - subgraph_nodes_remove.append(item) - - # input_ids = self.model.get_graph_inputs_excluding_initializers()[0] - # token_type_ids = self.model.get_graph_inputs_excluding_initializers()[1] - # attention_mask = self.model.get_graph_inputs_excluding_initializers()[2] - - emblayernorm_out = word_embeddings_node[2].output[0] - emblayernorm_out_mask = attention_mask_node[0].output[0] - - beta_data = self.model.get_initializer(word_embeddings_node[2].input[2], True) - embeddings_layernorm_beta_name = "bert_embeddings_layernorm_beta" - embeddings_layernorm_beta = helper.make_tensor( - embeddings_layernorm_beta_name, TensorProto.FLOAT, beta_data.shape, beta_data.flatten().tolist()) - - gamma_data = self.model.get_initializer(word_embeddings_node[2].input[1], True) - embeddings_layernorm_gamma_name = "bert_embeddings_layernorm_gamma" - embeddings_layernorm_gamma = helper.make_tensor( - embeddings_layernorm_gamma_name, TensorProto.FLOAT, gamma_data.shape, gamma_data.flatten().tolist()) - - embeddings_word_embeddings_data = self.model.get_initializer(word_embeddings_node[5].input[0], True) - embeddings_word_embeddings_name = "bert_embeddings_word_embeddings" - embeddings_word_embeddings = helper.make_tensor( - embeddings_word_embeddings_name, TensorProto.FLOAT, embeddings_word_embeddings_data.shape, - embeddings_word_embeddings_data.flatten().tolist()) - - embeddings_token_type_embeddings_data = self.model.get_initializer(token_type_embeddings_node[5].input[0], True) - embeddings_token_type_embeddings_name = "bert_embeddings_token_type_embeddings" - embeddings_token_type_embeddings = 
helper.make_tensor( - embeddings_token_type_embeddings_name, TensorProto.FLOAT, embeddings_token_type_embeddings_data.shape, - embeddings_token_type_embeddings_data.flatten().tolist()) - - embeddings_position_embeddings_data = self.model.get_initializer(token_type_embeddings_node[3].input[1], True) - embeddings_position_embeddings_name = "bert_embeddings_token_type_embeddings" - embeddings_position_embeddings = helper.make_tensor( - embeddings_position_embeddings_name, TensorProto.FLOAT, embeddings_position_embeddings_data.shape, - embeddings_position_embeddings_data.flatten().tolist()) - - self.model.add_initializer(embeddings_layernorm_beta, self.this_graph_name) - self.model.add_initializer(embeddings_layernorm_gamma, self.this_graph_name) - self.model.add_initializer(embeddings_word_embeddings, self.this_graph_name) - self.model.add_initializer(embeddings_token_type_embeddings, self.this_graph_name) - self.model.add_initializer(embeddings_position_embeddings, self.this_graph_name) - - emblayernorm_node = helper.make_node( - "CustomEmbLayerNormPluginDynamic_IxRT", - inputs=[word_embeddings_node[5].input[1], token_type_embeddings_node[5].input[1], attention_mask_node[3].input[0]], - outputs=[emblayernorm_out, emblayernorm_out_mask], - name=self.model.create_node_name( - "BertEmbedLayerNormalization", name_prefix="BertEmbedLayerNormalization" - ), - ) - emblayernorm_node.domain = "com.iluvatar" - emblayernorm_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - emblayernorm_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - emblayernorm_node.attribute.extend([helper.make_attribute("output_fp16", 1)]) - emblayernorm_node.attribute.extend([helper.make_attribute("full_mask", 1)]) - emblayernorm_node.attribute.extend([helper.make_attribute("mha_type_id", 2)]) - emblayernorm_node.attribute.extend([helper.make_attribute("pad_id", 0)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_beta", embeddings_layernorm_beta)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_gamma", embeddings_layernorm_gamma)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_word_embeddings", embeddings_word_embeddings)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_token_type_embeddings", embeddings_token_type_embeddings)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_position_embeddings", embeddings_position_embeddings)]) - - self.nodes_to_remove.extend(subgraph_nodes_remove) - - self.nodes_to_add.append(emblayernorm_node) - self.node_name_to_graph_name[emblayernorm_node.name] = self.this_graph_name \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py deleted file mode 100644 index 067ff26e4eb51ea0df3ad6b49318179afd3b4177..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py +++ /dev/null @@ -1,420 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Dict, Optional - -from onnx import helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionFastGelu(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomGeluPluginDynamic_IxRT", "Tanh") - - def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict): - if self.fuse_1(tanh_node, input_name_to_nodes, output_name_to_node): - return - - if self.fuse_2(tanh_node, input_name_to_nodes, output_name_to_node): - return - - if self.fuse_3(tanh_node, input_name_to_nodes, output_name_to_node): - return - - def fuse_1( - self, tanh_node, input_name_to_nodes, output_name_to_node - ) -> Optional[bool]: - """ - Fuse Gelu with tanh into one node: - +---------------------------+ - | | - | v - [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul - | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^ - | | - +------> Mul(B=0.5)--------------------------------------------+ - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if tanh_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[tanh_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_tanh = children[0] - - if not self.model.has_constant_input(add_after_tanh, 1.0): - return - - if add_after_tanh.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_tanh.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_tanh = children[0] - - mul_half = self.model.match_parent( - mul_after_tanh, "Mul", None, output_name_to_node - ) - if mul_half is None: - return - - i = self.model.find_constant_input(mul_half, 0.5) - if i < 0: - return - - root_input = mul_half.input[0 if i == 1 else 1] - - # root_node could be None when root_input is graph input - root_node = self.model.get_parent( - mul_half, 0 if i == 1 else 1, output_name_to_node - ) - - mul_before_tanh = self.model.match_parent( - tanh_node, "Mul", 0, output_name_to_node - ) - if mul_before_tanh is None: - return - - i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) - if i < 0: - return - - add_before_tanh = self.model.match_parent( - mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node - ) - if add_before_tanh is None: - return - - mul_after_pow = self.model.match_parent( - add_before_tanh, - "Mul", - None, - output_name_to_node, - exclude=[root_node] if root_node else [], - ) - if mul_after_pow is None: - return - - i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) - if i < 0: - return - - pow = self.model.match_parent( - mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node - ) - if pow is None: - return - - if not self.model.has_constant_input(pow, 3.0): - return - - if pow.input[0] != root_input: - return - - subgraph_nodes = [ - mul_after_tanh, - mul_half, - add_after_tanh, - tanh_node, - mul_before_tanh, - add_before_tanh, - mul_after_pow, - pow, - ] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [mul_after_tanh.output[0]], - input_name_to_nodes, - output_name_to_node, - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "CustomGeluPluginDynamic_IxRT", - inputs=[root_input], - outputs=mul_after_tanh.output, - name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_2( - self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from Tensorflow model. - Fuse Gelu with tanh into one node: - +---------------------------+ - | | - | v - [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul(B=0.5)-->Mul--> - | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^ - | | - +---------------------------------------------------------------------------+ - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if tanh_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[tanh_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_tanh = children[0] - - if not self.model.has_constant_input(add_after_tanh, 1.0): - return - - if add_after_tanh.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_tanh.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_half = children[0] - - i = self.model.find_constant_input(mul_half, 0.5) - if i < 0: - return - - if mul_half.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[mul_half.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_mul_half = children[0] - - root_node = self.model.get_parent( - mul_after_mul_half, - 0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1, - output_name_to_node, - ) - if root_node is None: - return - - mul_before_tanh = self.model.match_parent( - tanh_node, "Mul", 0, output_name_to_node - ) - if mul_before_tanh is None: - return - - i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) - if i < 0: - return - - add_before_tanh = self.model.match_parent( - mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node - ) - if add_before_tanh is None: - return - - mul_after_pow = self.model.match_parent( - add_before_tanh, "Mul", None, output_name_to_node, exclude=[root_node] - ) - if mul_after_pow is None: - return - - i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) - if i < 0: - return - - pow = self.model.match_parent( - mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node - ) - if pow is None: - return - - if not self.model.has_constant_input(pow, 3.0): - return - - if pow.input[0] != root_node.output[0]: - return - - subgraph_nodes = [ - mul_after_mul_half, - mul_half, - add_after_tanh, - tanh_node, - mul_before_tanh, - add_before_tanh, - mul_after_pow, - pow, - ] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [mul_after_mul_half.output[0]], - input_name_to_nodes, - output_name_to_node, - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "CustomGeluPluginDynamic_IxRT", - inputs=[root_node.output[0]], - outputs=mul_after_mul_half.output, - name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_3( - self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - OpenAI's gelu implementation, also used in Megatron: - Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x))) - - Fuse subgraph into a FastGelu node: - +------------ Mul (B=0.79788456) -------------------+ - | | - +-------------------------------+ | - | | | - | v v - [root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul--> - | ^ - | | - +-----------> Mul (B=0.5) --------------------------------------------------------+ - """ - if tanh_node.output[0] not in input_name_to_nodes: - return - - children = input_name_to_nodes[tanh_node.output[0]] 
- if len(children) != 1 or children[0].op_type != "Add": - return - add_after_tanh = children[0] - - if not self.model.has_constant_input(add_after_tanh, 1.0): - return - - if add_after_tanh.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_tanh.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_last = children[0] - - mul_half = self.model.match_parent(mul_last, "Mul", None, output_name_to_node) - if mul_half is None: - return - - i = self.model.find_constant_input(mul_half, 0.5) - if i < 0: - return - - root_input = mul_half.input[0 if i == 1 else 1] - - mul_before_tanh = self.model.match_parent( - tanh_node, "Mul", 0, output_name_to_node - ) - if mul_before_tanh is None: - return - - add_1 = self.model.match_parent( - mul_before_tanh, "Add", None, output_name_to_node - ) - if add_1 is None: - return - j = self.model.find_constant_input(add_1, 1.0) - if j < 0: - return - - mul_7978 = self.model.match_parent( - mul_before_tanh, "Mul", None, output_name_to_node - ) - if mul_7978 is None: - return - k = self.model.find_constant_input(mul_7978, 0.7978, delta=0.0001) - if k < 0: - return - if mul_7978.input[0 if k == 1 else 1] != root_input: - return - - mul_before_add_1 = self.model.match_parent( - add_1, "Mul", 0 if j == 1 else 1, output_name_to_node - ) - if mul_before_add_1 is None: - return - - if mul_before_add_1.input[0] == root_input: - another = 1 - elif mul_before_add_1.input[1] == root_input: - another = 0 - else: - return - - mul_0447 = self.model.match_parent( - mul_before_add_1, "Mul", another, output_name_to_node - ) - if mul_0447 is None: - return - m = self.model.find_constant_input(mul_0447, 0.0447, delta=0.0001) - if m < 0: - return - - if mul_0447.input[0 if m == 1 else 1] != root_input: - return - - subgraph_nodes = [ - mul_0447, - mul_before_add_1, - add_1, - mul_before_tanh, - tanh_node, - add_after_tanh, - mul_7978, - mul_half, - mul_last, - ] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [mul_last.output[0]], - input_name_to_nodes, - output_name_to_node, - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "CustomGeluPluginDynamic_IxRT", - inputs=[root_input], - outputs=mul_last.output, - name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py deleted file mode 100644 index 1f60ab7628f1d700042cf1e025df5bb22fc1d641..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionRemoveUselessElementwise(Fusion): - """ - Fusion to remove useless elementwise in roformer model. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__(model, "Sqrt", "Sqrt") - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - paths = { - "path1": ( - ["Max", "Min", "Add", "GlobalAveragePool"], - [None, None, None, None], - ), - } - - pool_nodes, pool_path = self.match_parent_path_from_dict(node, paths) - - if pool_nodes is None: - logger.debug("GlobalAveragePool: failed searching path after pool node.") - return - - max_node = pool_nodes[0] - min_node = pool_nodes[1] - add_node = pool_nodes[2] - pool_node = pool_nodes[3] - if not self.model.has_constant_input(add_node, 9.999999960041972e-13): - return - - if not self.model.has_constant_input(max_node, 0): - return - - max_node.input[0] = pool_node.output[0] - self.nodes_to_remove.extend([min_node, add_node]) - - -class FusionFormatInvalidMask(Fusion): - """ - Fusion to format invalid mask in roformer model. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__(model, "", ["Greater"]) - - def fuse(self, start_node, input_name_to_nodes, output_name_to_node): - nodes = self.model.match_parent_path( - start_node, - [ - "ReduceMin", - "Cast", - "Concat", - "Unsqueeze", - "Greater", - "ReduceMin", - "Cast", - "Concat", - "Unsqueeze", - ], - [0, 0, 0, 0, 0, 0, 0, 0, 0], - ) - - if nodes is None: - logger.debug("Roformer: unable to format the mask.") - return - - unsqueeze_node = nodes[-1] - - for node in self.model.graph().node: - for (id, input) in enumerate(node.input): - if start_node.output[0] == input: - node.input[id] = unsqueeze_node.input[0] - - self.nodes_to_remove.extend(nodes) - self.nodes_to_remove.extend([start_node]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py deleted file mode 100644 index 714212664e452ad7a42daa3623185d973e4bb773..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py +++ /dev/null @@ -1,383 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
-# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Dict, Optional - -from onnx import helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionGelu(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "Gelu", "Erf") - - def fuse(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict): - if self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node): - return - if self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node): - return - if self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node): - return - self.fuse_4(erf_node, input_name_to_nodes, output_name_to_node) - - def fuse_1( - self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from PyTorch model - Fuse Gelu with Erf into one node: - Pattern 1: - +-------Mul(0.5)---------------------+ - | | - | v - [root] --> Div -----> Erf --> Add --> Mul --> - (B=1.4142...) (1) - - Pattern 2: - +------------------------------------+ - | | - | v - [root] --> Div -----> Erf --> Add --> Mul -->Mul --> - (B=1.4142...) (1) (0.5) - - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if erf_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[erf_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_erf = children[0] - - if not self.model.has_constant_input(add_after_erf, 1): - return - - if add_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_erf = children[0] - - div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node) - if div is None: - return - - if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: - return - - subgraph_input = div.input[0] - - another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0 - if subgraph_input == mul_after_erf.input[another]: # pattern 2 - children = input_name_to_nodes[mul_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_half = children[0] - if not self.model.has_constant_input(mul_half, 0.5): - return - subgraph_output = mul_half.output[0] - else: # pattern 1 - mul_half = self.model.match_parent( - mul_after_erf, "Mul", another, output_name_to_node - ) - if mul_half is None: - return - - if not self.model.has_constant_input(mul_half, 0.5): - return - - if subgraph_input not in mul_half.input: - return - - subgraph_output = mul_after_erf.output[0] - - subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "Gelu", inputs=[subgraph_input], outputs=[subgraph_output] - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_2( - self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from Keras model - Fuse Gelu with Erf into one node: - +------------------------------------------+ - | | - | v - [root] --> Div -----> Erf --> Add --> Mul -->Mul - (B=1.4142...) (A=1) (A=0.5) - - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if erf_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[erf_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_erf = children[0] - - if not self.model.has_constant_input(add_after_erf, 1): - return - - if add_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_erf = children[0] - - if not self.model.has_constant_input(mul_after_erf, 0.5): - return - - if mul_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[mul_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul = children[0] - - div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node) - if div is None: - return - - sqrt_node = None - if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: - sqrt_node = self.model.match_parent(div, "Sqrt", 1, output_name_to_node) - if sqrt_node is None: - return - if not self.model.has_constant_input(sqrt_node, 2.0): - return - - root_node = self.model.get_parent(div, 0, output_name_to_node) - if root_node is None: - return - - if root_node.output[0] not in mul.input: - return - - subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul] - if sqrt_node: - subgraph_nodes.append(sqrt_node) - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "Gelu", inputs=[root_node.output[0]], outputs=[mul.output[0]] - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_3( - self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from TensorFlow model - Fuse Gelu with Erf into one node: - +----------------------------------------------+ - | | - | v - [root] --> Mul -----> Erf --> Add --> Mul -->Mul - (A=0.7071067690849304) (B=1) (B=0.5) - - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - - if erf_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[erf_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_erf = children[0] - - if not self.model.has_constant_input(add_after_erf, 1): - return - - if add_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_half = children[0] - - if not self.model.has_constant_input(mul_half, 0.5): - return - - first_mul = self.model.match_parent(erf_node, "Mul", 0, output_name_to_node) - if first_mul is None: - return - - i = self.model.find_constant_input(first_mul, 0.7071067690849304, delta=0.001) - if i < 0: - return - - root_node = self.model.get_parent( - first_mul, 0 if i == 1 else 1, output_name_to_node - ) - if root_node is None: - return - - if mul_half.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[mul_half.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - last_mul = children[0] - - if not ( - last_mul.input[0] == root_node.output[0] - or last_mul.input[1] == root_node.output[0] - ): - return - - subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [last_mul.output[0]], - input_name_to_nodes, - output_name_to_node, - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "Gelu", inputs=[root_node.output[0]], outputs=[last_mul.output[0]] - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_4( - self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from TensorFlow model - Fuse Gelu with Erf into one node: - Pattern 1: - +-------Mul(0.5)---------------------+ - | | - | v - [root] --> Mul -----> Erf --> Add --> Mul --> - (B=0.7071...) (1) - - Pattern 2: - +------------------------------------+ - | | - | v - [root] --> Mul -----> Erf --> Add --> Mul -->Mul --> - (B=0.7071...) (1) (0.5) - - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if erf_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[erf_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_erf = children[0] - - if not self.model.has_constant_input(add_after_erf, 1): - return - - if add_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_erf = children[0] - - mul_before_erf = self.model.match_parent( - erf_node, "Mul", 0, output_name_to_node - ) - if mul_before_erf is None: - return - - if self.model.find_constant_input(mul_before_erf, 0.7071, delta=0.001) != 1: - return - - subgraph_input = mul_before_erf.input[0] - - another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0 - if subgraph_input == mul_after_erf.input[another]: # pattern 2 - children = input_name_to_nodes[mul_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_half = children[0] - if not self.model.has_constant_input(mul_half, 0.5): - return - subgraph_output = mul_half.output[0] - else: # pattern 1 - mul_half = self.model.match_parent( - mul_after_erf, "Mul", another, output_name_to_node - ) - if mul_half is None: - return - - if not self.model.has_constant_input(mul_half, 0.5): - return - - if subgraph_input not in mul_half.input: - return - - subgraph_output = mul_after_erf.output[0] - - subgraph_nodes = [ - mul_before_erf, - erf_node, - add_after_erf, - mul_after_erf, - mul_half, - ] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "Gelu", inputs=[subgraph_input], outputs=[subgraph_output] - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py deleted file mode 100644 index a89e558cb76aa8208e4a19983f038e9f3584ffdb..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger - -from onnx import helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - - -class FusionGeluApproximation(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "FastGelu", ["Gelu", "BiasGelu"], "GeluApproximation") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - new_node = helper.make_node( - "FastGelu", - inputs=node.input, - outputs=node.output, - name=self.model.create_node_name( - "FastGelu", node.op_type + "_Approximation" - ), - ) - new_node.domain = "com.microsoft" - self.nodes_to_remove.append(node) - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py deleted file mode 100644 index 805cd3bf7dfbf337a633eaa583d14833cdf86282..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py +++ /dev/null @@ -1,528 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
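The `FusionGeluApproximation` pass above rewrites `Gelu`/`BiasGelu` into `FastGelu`, which is generally implemented as the tanh-based approximation of GELU; the rewrite therefore changes results slightly, which is why it is opt-in via `enable_gelu_approximation` in the fusion options later in this diff. A rough sketch of the accuracy gap, again using NumPy/SciPy only for the reference `erf` (illustrative only):

```python
import numpy as np
from scipy.special import erf

def gelu_exact(x):
    return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))

def gelu_tanh(x):
    # Tanh approximation commonly used by FastGelu-style kernels.
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

x = np.linspace(-6.0, 6.0, 1001)
print(np.max(np.abs(gelu_exact(x) - gelu_tanh(x))))  # small, on the order of 1e-3 or less
```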
-# -------------------------------------------------------------------------- -from logging import getLogger - -import numpy as np -from onnx import TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionGptAttentionPastBase(Fusion): - """Base class for GPT Attention Fusion with past state""" - - def __init__(self, model: OnnxModel, num_heads: int): - super().__init__(model, "Attention", "LayerNormalization", "with past") - self.num_heads = num_heads - self.utils = FusionUtils(model) - self.casted_attention_mask = ( - {} - ) # map from name of attention mask to the name that casted to int32 - - def match_past_pattern_1(self, concat_k, concat_v, output_name_to_node): - # Pattern 1: - # {past} - # / \ - # / \ - # Gather(axes=0, indices=0) Gather(indices=1) - # | | - # Transpose (perm=0,1,3,2) | - # | | - # Concat_k Concat_v - # | / - # Transpose (perm=0,1,3,2) / - # | / - # Unsqueeze Unsqueeze - # \ / - # \ / - # Concat - # | - # {present} - gather = self.model.get_parent(concat_v, 0, output_name_to_node) - if gather.op_type != "Gather": - logger.debug("match_past_pattern_1: expect Gather for past") - return None - - if not self.model.find_constant_input(gather, 1) == 1: - logger.debug("match_past_pattern_1: expect indices=1 for Gather of past") - return None - past = gather.input[0] - - parent = self.model.get_parent(concat_k, 0, output_name_to_node) - if parent.op_type == "Gather": - gather_past_k = parent - else: - past_k_nodes = self.model.match_parent_path( - concat_k, ["Transpose", "Gather"], [0, 0] - ) - if past_k_nodes is None: - logger.debug("match_past_pattern_1: failed match Transpose and Gather") - return None - gather_past_k = past_k_nodes[-1] - - if not self.model.find_constant_input(gather_past_k, 0) == 1: - logger.debug("match_past_pattern_1: expect indices=0 for Gather k of past") - return None - past_k = gather_past_k.input[0] - if past != past_k: - logger.debug("match_past_pattern_1: expect past to be same") - return None - - return past - - def match_past_pattern_2(self, concat_k, concat_v, output_name_to_node): - # Pattern 2: - # Split (QKV) - # / | | - # / | +----------------------+ - # | | - # | {past} | - # | | | - # Reshape Split Reshape - # | / \ | - # Transpose_k Squeeze Squeeze Transpose_v - # | | \ / - # +------|---+ \ / - # | | \ / - # Concat_k Concat_v - # | | - # Unsqueeze Unsqueeze - # \ / - # Concat - # | - # {present} - # - squeeze = self.model.get_parent(concat_v, 0, output_name_to_node) - if squeeze.op_type != "Squeeze": - logger.debug("match_past_pattern_2: expect Squeeze as parent of concat_v") - return None - - split = self.model.get_parent(squeeze, 0, output_name_to_node) - if split.op_type != "Split": - logger.debug("match_past_pattern_2: expect Split for past path") - return None - - opset_version = self.model.get_opset_version() - if opset_version < 13: - if not FusionUtils.check_node_attribute(squeeze, "axes", [0]): - logger.debug( - "match_past_pattern_2: axes != [0] for Squeeze in past path" - ) - return None - - if not FusionUtils.check_node_attribute(split, "split", [1, 1]): - logger.debug( - "match_past_pattern_2: split != [1, 1] for Split in past path" - ) - return None - else: - if not self.utils.check_node_input_value(squeeze, 1, [0]): - logger.debug( - "match_past_pattern_2: axes != [0] for Squeeze in past path" - ) - return None - - if not self.utils.check_node_input_value(split, 1, [1, 1]): - 
logger.debug( - "match_past_pattern_2: split != [1, 1] for Split in past path" - ) - return None - - if not FusionUtils.check_node_attribute(split, "axis", 0, default_value=0): - logger.debug( - "match_past_pattern_2: attribute axis of Split are not expected in past path" - ) - return None - past = split.input[0] - - past_k_nodes = self.model.match_parent_path( - concat_k, ["Squeeze", "Split"], [0, 0] - ) - if past_k_nodes is None: - logger.debug("match_past_pattern_2: failed to match past_k_nodes path") - return None - past_k = past_k_nodes[-1].input[0] - - if past != past_k: - logger.info("match_past_pattern_2: expect past to be same") - return None - - return past - - def match_present(self, concat_v, input_name_to_nodes): - unsqueeze_present_v = self.model.find_first_child_by_type( - concat_v, "Unsqueeze", input_name_to_nodes, recursive=False - ) - if not unsqueeze_present_v: - logger.info("expect unsqueeze for present") - return None - concat_present = self.model.find_first_child_by_type( - unsqueeze_present_v, "Concat", input_name_to_nodes, recursive=False - ) - if not concat_present: - logger.info("expect concat for present") - return None - - present = concat_present.output[0] - return present - - def cast_attention_mask(self, input_name): - if input_name in self.casted_attention_mask: - attention_mask_input_name = self.casted_attention_mask[input_name] - elif self.model.find_graph_input(input_name): - casted, attention_mask_input_name = self.utils.cast_graph_input_to_int32( - input_name - ) - self.casted_attention_mask[input_name] = attention_mask_input_name - else: - attention_mask_input_name, cast_node = self.utils.cast_input_to_int32( - input_name - ) - self.casted_attention_mask[input_name] = attention_mask_input_name - return attention_mask_input_name - - -class FusionGptAttention(FusionGptAttentionPastBase): - """ - Fuse GPT-2 Attention with past state subgraph into one Attention node. 
- """ - - def __init__(self, model: OnnxModel, num_heads: int): - super().__init__(model, num_heads) - - def create_attention_node( - self, - fc_weight, - fc_bias, - gemm_qkv, - past, - present, - input, - output, - mask, - is_unidirectional, - ): - attention_node_name = self.model.create_node_name("GptAttention") - attention_node = helper.make_node( - "Attention", - inputs=[input, fc_weight, fc_bias, mask, past], - outputs=[attention_node_name + "_output", present], - name=attention_node_name, - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [ - helper.make_attribute("num_heads", self.num_heads), - helper.make_attribute("unidirectional", 1 if is_unidirectional else 0), - ] - ) - - matmul_node = helper.make_node( - "MatMul", - inputs=[attention_node_name + "_output", gemm_qkv.input[1]], - outputs=[attention_node_name + "_matmul_output"], - name=attention_node_name + "_matmul", - ) - - add_node = helper.make_node( - "Add", - inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]], - outputs=[output], - name=attention_node_name + "_add", - ) - self.nodes_to_add.extend([attention_node, matmul_node, add_node]) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name - self.node_name_to_graph_name[add_node.name] = self.this_graph_name - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - past = None - present = None - return_indice = [] - qkv_nodes = self.model.match_parent_path( - normalize_node, - ["Add", "Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"], - [0, None, 0, 0, 0, 0, 0], - output_name_to_node=output_name_to_node, - return_indice=return_indice, - ) # yapf: disable - if qkv_nodes is None: - return - ( - add_qkv, - reshape_qkv, - gemm_qkv, - reshape_1, - reshape_2, - transpose_qkv, - matmul_qkv, - ) = qkv_nodes - - another_input = add_qkv.input[1 - return_indice[0]] - - v_nodes = self.model.match_parent_path( - matmul_qkv, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0] - ) - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - (concat_v, transpose_v, reshape_v, split_fc) = v_nodes - - fc_nodes = self.model.match_parent_path( - split_fc, - ["Reshape", "Gemm", "Reshape", "LayerNormalization"], - [0, 0, 0, 0], - output_name_to_node, - ) - if fc_nodes is None: - fc_nodes = self.model.match_parent_path( - split_fc, - ["Add", "MatMul", "LayerNormalization"], - [0, None, 0], - output_name_to_node, - ) - if fc_nodes is None: - logger.debug("fuse_attention: failed to match fc path") - return - fc_weight = fc_nodes[1].input[1] - i, _ = self.model.get_constant_input(fc_nodes[0]) - fc_bias = fc_nodes[0].input[i] - else: - fc_weight = fc_nodes[1].input[1] - fc_bias = fc_nodes[1].input[2] - - layernorm_before_attention = fc_nodes[-1] - - if not another_input in layernorm_before_attention.input: - logger.debug("Add and LayerNormalization shall have one same input") - return - - is_unidirectional = True - slice_mask = None - input_mask_nodes = None - concat_k_to_match = None - qk_nodes = self.model.match_parent_path( - matmul_qkv, ["Softmax", "Sub", "Mul", "Div", "MatMul"], [0, 0, 0, 0, 0] - ) - if qk_nodes is not None: - (softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes - mask_nodes = self.model.match_parent_path( - sub_qk, - [ - "Mul", - "Sub", - "Slice", - "Slice", - "Unsqueeze", - "Sub", - "Squeeze", - "Slice", - "Shape", - "Div", - ], - [1, 0, 1, 0, 1, 0, 0, 0, 0, 0], 
- ) # yapf: disable - if mask_nodes is None: - logger.debug("fuse_attention: failed to match unidirectional mask path") - return - div_mask = mask_nodes[-1] - slice_mask = mask_nodes[3] - - if div_qk != div_mask: - logger.debug("fuse_attention: skip since div_qk != div_mask") - return - else: - # New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0. - i, qk_nodes, _ = self.model.match_parent_paths( - matmul_qkv, - [ - (["Softmax", "Where", "Div", "MatMul"], [0, 0, 1, 0]), - (["Softmax", "Add", "Where", "Div", "MatMul"], [0, 0, None, 1, 0]), - ], - output_name_to_node, - ) - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk nodes") - return - - where_qk = qk_nodes[-3] - div_qk = qk_nodes[-2] - matmul_qk = qk_nodes[-1] - - if i == 1: - add_qk = qk_nodes[1] - _, input_mask_nodes, _ = self.model.match_parent_paths( - add_qk, - [ - ( - ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze", "Reshape"], - [None, 0, 1, 0, 0, 0], - ), - ( - ["Mul", "Sub", "Unsqueeze", "Unsqueeze", "Reshape"], - [None, 0, 1, 0, 0], - ), - ( - ["Mul", "Sub", "Unsqueeze", "Unsqueeze"], - [None, 0, 1, 0], - ), # useless cast and reshape are removed. - ], - output_name_to_node, - ) # yapf: disable - if input_mask_nodes is None: - logger.debug( - "fuse_attention: failed to match input attention mask path" - ) - return - - mask_nodes = self.model.match_parent_path( - where_qk, - [ - "Cast", - "Slice", - "Slice", - "Unsqueeze", - "Sub", - "Squeeze", - "Slice", - "Shape", - ], - [0, 0, 0, 1, 0, 0, 0, 0], - output_name_to_node, - ) # yapf: disable - if mask_nodes is None: - # TODO: match mask path for GPT2LMHeadModel_BeamSearchStep. - logger.debug("fuse_attention: failed to match mask path") - return - - slice_mask = mask_nodes[2] - - div_or_concat = self.model.get_parent( - mask_nodes[-1], 0, output_name_to_node - ) - if div_or_concat.op_type == "Div": - div_mask = div_or_concat - if div_qk != div_mask: - logger.debug("fuse_attention: skip since div_qk != div_mask") - return - elif div_or_concat.op_type == "Concat": - concat_k_to_match = div_or_concat - else: - logger.debug("fuse_attention: failed to match mask path") - - # Validate that the mask data is either lower triangular (unidirectional) or all ones - mask_data = numpy_helper.to_array( - self.model.get_initializer(slice_mask.input[0]) - ) - if not ( - len(mask_data.shape) == 4 - and mask_data.shape[:2] == (1, 1) - and mask_data.shape[2] == mask_data.shape[3] - ): - logger.debug("fuse_attention: skip since mask shape is not 1x1xWxW") - return - if np.allclose(mask_data, np.ones_like(mask_data)): - is_unidirectional = False - elif not np.allclose(mask_data, np.tril(np.ones_like(mask_data))): - logger.debug( - "fuse_attention: skip since mask is neither lower triangular nor ones" - ) - return - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0] - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - (transpose_q, reshape_q, split_q) = q_nodes - if split_fc != split_q: - logger.debug("fuse_attention: skip since split_fc != split_q") - return - - k_nodes = self.model.match_parent_path( - matmul_qk, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0] - ) - if k_nodes is None: - # This pattern is from pytorch 1.7.1 and transformers 4.6.1 - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Transpose", "Concat", "Transpose", "Reshape", "Split"], - [1, 0, 1, 0, 0], - ) - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - 
return - else: - (_, concat_k, transpose_k, reshape_k, split_k) = k_nodes - else: - (concat_k, transpose_k, reshape_k, split_k) = k_nodes - if split_fc != split_k: - logger.debug("fuse_attention: skip since split_fc != split_k") - return - - if concat_k_to_match and concat_k != concat_k_to_match: - logger.debug("fuse_attention: skip since concat_k != concat_k_to_match") - return - - attention_mask_input_name = "" - if input_mask_nodes is not None: - input_name = input_mask_nodes[-1].input[0] - attention_mask_input_name = self.cast_attention_mask(input_name) - - # Match past and present paths - past = self.match_past_pattern_1( - concat_k, concat_v, output_name_to_node - ) or self.match_past_pattern_2(concat_k, concat_v, output_name_to_node) - if past is None: - logger.info("fuse_attention: failed to match past path") - return - if not self.model.find_graph_input(past): - logger.debug("past is not graph input.") - # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input. - - present = self.match_present(concat_v, input_name_to_nodes) - if present is None: - logger.info("fuse_attention: failed to match present path") - return - if not self.model.find_graph_output(present): - logger.info("expect present to be graph output") - return - - self.create_attention_node( - fc_weight, - fc_bias, - gemm_qkv, - past, - present, - layernorm_before_attention.output[0], - reshape_qkv.output[0], - attention_mask_input_name, - is_unidirectional, - ) - - # we rely on prune_graph() to clean old subgraph nodes: - # qk_nodes + q_nodes + k_nodes + v_nodes + mask_nodes + [reshape_qkv, transpose_qkv, matmul_qkv] - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py deleted file mode 100644 index 138a9c5ff495d59830ec0c7761a674d7beacb834..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
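For reference, the mask validation in `FusionGptAttention.fuse` above decides between causal (unidirectional) and bidirectional attention purely from the constant 1x1xWxW mask initializer: all ones means bidirectional, a lower-triangular matrix means causal. A small NumPy illustration of that check (the helper name is made up for the example):

```python
import numpy as np

w = 4
causal = np.tril(np.ones((1, 1, w, w), dtype=np.float32))        # lower triangle on the last two axes
bidirectional = np.ones((1, 1, w, w), dtype=np.float32)

def looks_unidirectional(mask):
    # Mirrors the logic above: all ones -> bidirectional, lower triangular -> causal.
    if np.allclose(mask, np.ones_like(mask)):
        return False
    return bool(np.allclose(mask, np.tril(np.ones_like(mask))))

print(looks_unidirectional(causal), looks_unidirectional(bidirectional))  # True False
```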
-# -------------------------------------------------------------------------- -from logging import getLogger - -import numpy as np -from onnx import TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_gpt_attention import FusionGptAttentionPastBase -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -def is_close(value, expected_value): - return abs(value - expected_value) <= 1e-6 - - -class FusionGptAttentionMegatron(FusionGptAttentionPastBase): - """ - Fuse GPT-2 Attention with past state subgraph from Megatron into one Attention node. - """ - - def __init__(self, model: OnnxModel, num_heads: int): - super().__init__(model, num_heads) - - def fuse_attention_node( - self, - matmul_before_split, - add_before_split, - past, - present, - input, - reshape_qkv, - mask, - ): - attention_node_name = self.model.create_node_name("GptAttention") - int32_mask = self.cast_attention_mask(mask) - output = reshape_qkv.output[0] - i = 1 if (add_before_split.input[0] == matmul_before_split.output[0]) else 0 - attention_node = helper.make_node( - "Attention", - inputs=[ - input, - matmul_before_split.input[1], - add_before_split.input[i], - int32_mask, - past, - ], - outputs=[output, present], - name=attention_node_name, - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [ - helper.make_attribute("num_heads", self.num_heads), - helper.make_attribute( - "unidirectional", 0 - ), # unidirectional shall not be ON for 4D attention mask - ] - ) - - nodes_to_add = [attention_node] - self.nodes_to_add.extend(nodes_to_add) - - for node in nodes_to_add: - self.node_name_to_graph_name[node.name] = self.this_graph_name - - self.nodes_to_remove.append(reshape_qkv) - - # we rely on prune_graph() to clean old subgraph nodes - self.prune_graph = True - - def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention): - mask_nodes = self.model.match_parent_path( - sub_qk, ["Mul", "Sub", "Slice", "Slice"], [1, 0, 1, 0] - ) # yapf: disable - if mask_nodes is None: - logger.debug("fuse_attention: failed to match unidirectional mask path") - return None - (mul_mask, sub_mask, last_slice_mask, slice_mask) = mask_nodes - - if mul_qk.input[1] != last_slice_mask.output[0]: - logger.debug( - "fuse_attention failed: mul_qk.input[1] != last_slice_mask.output[0]" - ) - return None - - if not self.utils.check_node_input_value(mul_mask, 1, 10000.0): - logger.debug( - "fuse_attention failed: mul_mask input 1 is not constant 10000.0" - ) - return None - - if not self.utils.check_node_input_value(sub_mask, 0, 1.0): - logger.debug("fuse_attention failed: sub_mask input 0 is not constant 1.0") - return None - - if not self.model.find_graph_input(slice_mask.input[0]): - logger.info("expect slick_mask input 0 to be graph input") - return None - - if not self.utils.check_node_input_value(last_slice_mask, 1, [0]): - logger.debug( - "fuse_attention failed: last_slice_mask input 1 (starts) is not constant [0]" - ) - return None - - if not self.utils.check_node_input_value(last_slice_mask, 3, [3]): - logger.debug( - "fuse_attention failed: last_slice_mask input 3 (axes) is not constant [3]" - ) - return False - - if not self.utils.check_node_input_value(last_slice_mask, 4, [1]): - logger.debug( - "fuse_attention failed: last_slice_mask input 4 (steps) is not constant [1]" - ) - return False - - if not self.utils.check_node_input_value(slice_mask, 3, [2]): - logger.debug( - "fuse_attention failed: slice_mask input 3 
(axes) is not constant [2]" - ) - return None - - if not self.utils.check_node_input_value(slice_mask, 4, [1]): - logger.debug( - "fuse_attention failed: slice_mask input 4 (steps) is not constant [1]" - ) - return None - - last_slice_path = self.model.match_parent_path( - last_slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0] - ) - if last_slice_path is None or last_slice_path[-1] != matmul_qk: - logger.debug("fuse_attention: failed to match last slice path") - return None - - first_slice_path = self.model.match_parent_path( - slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0] - ) - if first_slice_path is None or first_slice_path[-1] != matmul_qk: - logger.debug("fuse_attention: failed to match first slice path") - return None - - first_slice_sub = self.model.match_parent_path( - slice_mask, - ["Unsqueeze", "Sub", "Gather", "Shape", "MatMul"], - [1, 0, 0, 0, 0], - ) - if first_slice_sub is None or first_slice_sub[-1] != matmul_qk: - logger.debug("fuse_attention: failed to match last slice sub path") - return None - - first_slice_sub_1 = self.model.match_parent_path( - slice_mask, - ["Unsqueeze", "Sub", "Gather", "Shape", "LayerNormalization"], - [1, 0, 1, 0, 0], - ) - if ( - first_slice_sub_1 is None - or first_slice_sub_1[-1] != layernorm_before_attention - ): - logger.debug("fuse_attention: failed to match last slice sub path 1") - return None - - return slice_mask.input[0] - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - past = None - present = None - - qkv_nodes = self.model.match_parent_path( - normalize_node, - ["Add", "Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [0, 1, None, 0, 0, 0], - output_name_to_node=output_name_to_node, - ) # yapf: disable - if qkv_nodes is None: - return - ( - add_skip, - add_after_attention, - matmul_after_attention, - reshape_qkv, - transpose_qkv, - matmul_qkv, - ) = qkv_nodes - - skip_input = add_skip.input[0] - - v_nodes = self.model.match_parent_path( - matmul_qkv, - [ - "Concat", - "Transpose", - "Reshape", - "Split", - "Add", - "MatMul", - "LayerNormalization", - ], - [1, 1, 0, 0, 0, None, 0], - ) # yapf: disable - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - ( - concat_v, - transpose_v, - reshape_v, - split_v, - add_before_split, - matmul_before_split, - layernorm_before_attention, - ) = v_nodes - if skip_input != layernorm_before_attention.input[0]: - logger.debug( - "fuse_attention: skip_input != layernorm_before_attention.input[0]" - ) - return - - qk_nodes = self.model.match_parent_path( - matmul_qkv, ["Softmax", "Sub", "Mul", "MatMul"], [0, 0, 0, 0] - ) - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return None - (softmax_qk, sub_qk, mul_qk, matmul_qk) = qk_nodes - if self.model.get_node_attribute(softmax_qk, "axis") != 3: - logger.debug("fuse_attention failed: softmax_qk axis != 3") - return None - - attention_mask = self.match_mask( - sub_qk, mul_qk, matmul_qk, layernorm_before_attention - ) - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Div", "Transpose", "Reshape", "Split"], [0, 0, 0, 0] - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - (div_q, transpose_q, reshape_q, split_q) = q_nodes - if split_v != split_q: - logger.debug("fuse_attention: skip since split_v != split_q") - return - - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Div", "Transpose", "Concat", "Transpose", "Reshape", "Split"], - [1, 0, 0, 1, 0, 0], - ) - 
if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - (div_k, _, concat_k, transpose_k, reshape_k, split_k) = k_nodes - if split_v != split_k: - logger.debug("fuse_attention: skip since split_v != split_k") - return - - i, value = self.model.get_constant_input(reshape_k) - if not ( - isinstance(value, np.ndarray) - and list(value.shape) == [4] - and value[0] == 0 - and value[1] == 0 - and value[2] > 0 - and value[3] > 0 - ): - logger.debug("fuse_attention: reshape constant input is not [0, 0, N, H]") - return - - num_heads = value[2] - if num_heads != self.num_heads: - logger.info( - f"Detected num_heads={num_heads}. Ignore user specified value {self.num_heads}" - ) - self.num_heads = num_heads - - hidden_size_per_head = value[3] - i, value = self.model.get_constant_input(div_k) - expected_value = float(np.sqrt(np.sqrt(hidden_size_per_head))) - if not is_close(value, expected_value): - logger.debug( - f"fuse_attention: div_k value={value} expected={expected_value}" - ) - return - - i, value = self.model.get_constant_input(div_q) - if not is_close(value, expected_value): - logger.debug( - f"fuse_attention: div_q value={value} expected={expected_value}" - ) - return - - # Match past and present paths - past = self.match_past_pattern_2(concat_k, concat_v, output_name_to_node) - if past is None: - logger.debug("fuse_attention: match past failed") - return - if not self.model.find_graph_input(past): - logger.debug("fuse_attention: past is not graph input.") - # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input. - - present = self.match_present(concat_v, input_name_to_nodes) - if present is None: - logger.debug("fuse_attention: match present failed") - return - if not self.model.find_graph_output(present): - logger.info("fuse_attention: expect present to be graph output") - return - - self.fuse_attention_node( - matmul_before_split, - add_before_split, - past, - present, - layernorm_before_attention.output[0], - reshape_qkv, - attention_mask, - ) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py deleted file mode 100644 index 4e538cf5833d096635e461eae34ab35edd20d3b1..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py +++ /dev/null @@ -1,273 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
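The `div_q`/`div_k` check in the Megatron variant above relies on the fact that scaling Q and K separately by `head_size ** 0.25` is equivalent to the usual `1/sqrt(head_size)` scaling of the QK^T product, which is why both `Div` constants are compared against `sqrt(sqrt(head_size))`. A quick numerical confirmation (illustrative only, array shapes chosen arbitrarily):

```python
import numpy as np

head_size = 64
q = np.random.rand(8, head_size).astype(np.float32)
k = np.random.rand(8, head_size).astype(np.float32)

# Scale Q and K separately (Megatron style) vs. scaling the product once.
scaled_separately = (q / head_size ** 0.25) @ (k / head_size ** 0.25).T
scaled_once = (q @ k.T) / np.sqrt(head_size)

assert np.allclose(scaled_separately, scaled_once, atol=1e-4)
```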
-# -------------------------------------------------------------------------- -import math -from logging import getLogger -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionGptAttentionNoPast(Fusion): - """ - Fuse GPT-2 Attention without past state into one Attention node. - This does not support attention_mask graph input right now. - """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - "without past", - ) - self.where_qk_shared = None - - def get_num_heads_and_hidden_size( - self, custom_fc: NodeProto, div: NodeProto - ) -> Tuple[int, int]: - div_initializer = self.model.get_initializer(div.input[1]) - - # 检查float_data是否为空 - if len(div_initializer.float_data) > 0: - div_value = div_initializer.float_data[0] - else: - # 如果float_data为空,尝试其他方式获取数据 - # 例如,如果数据存储在raw_data中 - if len(div_initializer.raw_data) > 0: - dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type] - div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0] - else: - raise ValueError("Data not found in the div_initializer") - - for attr in custom_fc.attribute: - if attr.name == "W": - tensor_value = attr.t - tensor_shape = [dim for dim in tensor_value.dims] - break - head_dim = math.ceil(div_value * div_value) - hidden_size = tensor_shape[1] - num_heads = hidden_size // head_dim - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input: str, - output: str, - where_qk: NodeProto, - ) -> Union[NodeProto, None]: - - attention_node_name = self.model.create_node_name("Attention") - - attention_inputs = [input] - if where_qk is not None: - has_mask = 1 - has_qk_bias = 1 - attention_inputs.append(where_qk.output[0]) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend( - [helper.make_attribute("has_qk_bias", has_qk_bias)] - ) - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - return_indice = [] - add_qkv = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - add_qkv = add_before_layernorm - - qkv_paths = { - "path1": ( - ["CustomFCPluginDynamic_IxRT", "Reshape", "Transpose", "MatMul"], - [None, 0, 0, 0], - ), - "path2": ( - ["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], - [None, 0, 0], - ), - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict( - add_qkv, - qkv_paths, - output_name_to_node, - return_indice, 
- ) # yapf: disable - - if qkv_nodes is None: - return - reshape_2 = None - if qkv_path == "path1": - ( - custom_fc_after_attention, - reshape_2, - transpose_qkv, - matmul_qkv, - ) = qkv_nodes - else: - ( - custom_fc_after_attention, - transpose_qkv, - matmul_qkv, - ) = qkv_nodes - - another_input = add_qkv.input[1 - return_indice[0]] - - v_nodes = self.model.match_parent_path( - matmul_qkv, - ["Transpose", "Reshape", "Split", "CustomFCPluginDynamic_IxRT"], - [1, 0, 0, 0], - ) # yapf: disable - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - ( - transpose_v, - reshape_v, - split_v, - custom_fc_before_attention, - ) = v_nodes - - layernorm_before_attention = self.model.get_parent( - custom_fc_before_attention, 0, output_name_to_node - ) - if ( - layernorm_before_attention is None - or layernorm_before_attention.op_type != "LayerNormalization" - ): - if layernorm_before_attention.op_type != "Add": - logger.debug( - f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}" - ) - return - - if not another_input in layernorm_before_attention.input: - # match openai-gpt - if not another_input in layernorm_before_attention.output: - logger.debug("Add and LayerNormalization shall have one same input") - return - - qk_nodes = self.model.match_parent_path( - matmul_qkv, ["Softmax", "Add", "Where", "Div", "MatMul"], [0, None, 0, 1, 0] - ) - where_qk = None - matmul_qk = None - mask_return_indices = [] - if qk_nodes is not None: - (softmax_qk, add_qk, where_qk, div_qk, matmul_qk) = qk_nodes - mask_nodes = self.model.match_parent_path( - add_qk, - ["Mul", "Sub", "Cast", "Unsqueeze"], - [None, 0, 1, 0], - return_indice=mask_return_indices, - ) # yapf: disable - if mask_nodes is None: - logger.debug("fuse_attention: failed to match mask path") - return - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0] - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - (transpose_q, reshape_q, split_q) = q_nodes - if split_v != split_q: - logger.debug("fuse_attention: skip since split_v != split_q") - return - - k_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Split"], [1, 0, 0] - ) - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - (transpose_k, reshape_k, split_k) = k_nodes - if split_v != split_k: - logger.debug("fuse_attention: skip since split_v != split_k") - return - - if where_qk is None: - return - - global num_heads, hidden_size - if self.where_qk_shared is None: - where_qk.input[1] = mask_nodes[0].output[0] - div_qk.output[0] = where_qk.output[0] - add_qk.input[1 - mask_return_indices[0]] = div_qk.output[0] - self.where_qk_shared = where_qk - self.nodes_to_remove.extend([softmax_qk, add_qk, div_qk, matmul_qk]) - - num_heads, hidden_size = self.get_num_heads_and_hidden_size( - custom_fc_after_attention, div_qk - ) - self.nodes_to_remove.extend([k_nodes[0]]) - self.nodes_to_remove.extend(v_nodes[:-2]) - else: - self.nodes_to_remove.extend( - [softmax_qk, add_qk, where_qk, div_qk, matmul_qk] - ) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes[:-1]) - - new_node = self.create_attention_node( - num_heads, - hidden_size, - custom_fc_before_attention.output[0], - transpose_qkv.output[0] if reshape_2 is None else reshape_2.output[0], - self.where_qk_shared, - ) - - self.nodes_to_add.append(new_node) - 
self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - if reshape_2 is not None: - self.nodes_to_remove.extend([reshape_2]) - self.nodes_to_remove.extend([transpose_qkv, matmul_qkv]) - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py deleted file mode 100644 index d19c3aff604ed6f3ae673ffa0c67143b66e36aaf..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py +++ /dev/null @@ -1,511 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Dict - -import numpy as np -from onnx import TensorProto, helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionLayerNormalization(Fusion): - def __init__(self, model: OnnxModel, hidden_size): - self.hidden_size = hidden_size - super().__init__(model, "LayerNormalization", "ReduceMean") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - Fuse Layer Normalization subgraph into one node LayerNormalization: - +----------------------+ - | | - | v - [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add - (axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^ - | | - +-----------------------------------------------+ - - It also handles cases of duplicated sub nodes exported from older version of PyTorch: - +----------------------+ - | v - | +-------> Sub-----------------------------------------------+ - | | | - | | v - [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add - | ^ - | | - +----------------------+ - """ - children = self.model.get_children(node, input_name_to_nodes) - if len(children) == 0 or len(children) > 2: - return - - root_input = node.input[0] - - if children[0].op_type != "Sub" or children[0].input[0] != root_input: - return - - if len(children) == 2: - if children[1].op_type != "Sub" or children[1].input[0] != root_input: - return - - div_node = None - for child in children: - div_node = self.model.find_first_child_by_type( - child, "Div", input_name_to_nodes, recursive=False - ) - if div_node is not None: - break - if div_node is None: - return - - path_id, parent_nodes, _ = self.model.match_parent_paths( - div_node, - [ - (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]), - ( - ["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"], - [1, 0, 0, 0, 0, 0], - ), - ], - 
output_name_to_node, - ) - if path_id < 0: - return - - sub_node = parent_nodes[-1] - if sub_node not in children: - return - - second_add_node = parent_nodes[1] - i, add_weight = self.model.get_constant_input(second_add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.warning(f"epsilon value is not expeced: {add_weight}") - return - - pow_node = parent_nodes[3] - if not self.model.find_constant_input(pow_node, 2.0) == 1: - return - - mul_node = input_name_to_nodes[div_node.output[0]][0] - is_not_have_mul_and_add = False - is_not_have_mul_and_add_lst_node = None - # deal with special case : layernorm do not have mul and add - if mul_node.op_type != "Mul" and mul_node.op_type == "MatMul": - is_not_have_mul_and_add = True - is_not_have_mul_and_add_lst_node = div_node - elif mul_node.op_type != "Mul": - return - - if is_not_have_mul_and_add: - last_add_node = is_not_have_mul_and_add_lst_node - if self.hidden_size == 0: - print( - "[Error] Please add '--hidden_size' and '--num_head' to fuse layernorm ..." - ) - exit(0) - - subgraph_nodes = [node] - subgraph_nodes.extend(children) - subgraph_nodes.extend(parent_nodes[:-1]) - subgraph_nodes.extend([last_add_node]) - if len(subgraph_nodes) == 7: - self.nodes_to_remove.extend(subgraph_nodes) - else: - return - - norm_name = self.model.create_node_name( - "LayerNormalization", name_prefix="LayerNorm" - ) - np_weights = np.ones((self.hidden_size)).astype(np.float32) - np_weights_name = norm_name + "_weights" - weights_tensor = helper.make_tensor( - np_weights_name, TensorProto.FLOAT, np_weights.shape, np_weights - ) - np_bias = np.zeros((self.hidden_size)).astype(np.float32) - np_bias_name = norm_name + "_bias" - bias_tensor = helper.make_tensor( - np_bias_name, TensorProto.FLOAT, np_bias.shape, np_bias - ) - self.model.add_initializer(weights_tensor) - self.model.add_initializer(bias_tensor) - normalize_node = helper.make_node( - "LayerNormalization", - inputs=[node.input[0], np_weights_name, np_bias_name], - outputs=[last_add_node.output[0]], - name=norm_name, - ) - normalize_node.attribute.extend( - [helper.make_attribute("epsilon", float(add_weight))] - ) - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - else: - last_add_node = input_name_to_nodes[mul_node.output[0]][0] - if last_add_node.op_type != "Add": - return - - subgraph_nodes = [node] - subgraph_nodes.extend(children) - subgraph_nodes.extend(parent_nodes[:-1]) - - subgraph_nodes.extend([last_add_node, mul_node, div_node]) - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - last_add_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug(f"It is not safe to fuse LayerNormalization node. 
Skip") - return - - weight_input = mul_node.input[ - 1 - self.model.input_index(div_node.output[0], mul_node) - ] - if not self.model.is_constant_with_specified_dimension( - weight_input, 1, "layernorm weight" - ): - return - - bias_input = last_add_node.input[ - 1 - self.model.input_index(mul_node.output[0], last_add_node) - ] - if not self.model.is_constant_with_specified_dimension( - bias_input, 1, "layernorm bias" - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - normalize_node = helper.make_node( - "LayerNormalization", - inputs=[node.input[0], weight_input, bias_input], - outputs=[last_add_node.output[0]], - name=self.model.create_node_name( - "LayerNormalization", name_prefix="LayerNorm" - ), - ) - normalize_node.attribute.extend( - [helper.make_attribute("epsilon", float(add_weight))] - ) - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - - -class FusionLayerNormalizationKeras(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, "LayerNormalization", "GlobalAveragePool", "Keras layernorm" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - +-------------------------------+ - | | - | v - [Root] --> GlobalAveragePool--> Sub --> Mul --> GlobalAveragePool --> Add/Min/Max --> Sqrt --> Div --> Mul --> Add - | ^ - | | - +---------------------------------------------------------------+ - """ - children = self.model.get_children(node, input_name_to_nodes) - # print(len(children)) - if len(children) != 1: - return - - root_input = node.input[0] - - if children[0].op_type != "Sub" or children[0].input[0] != root_input: - return - - div_node = None - for child in children: - div_node = self.model.find_first_child_by_type( - child, "Div", input_name_to_nodes, recursive=False - ) - if div_node is not None: - break - if div_node is None: - return - # print('div_node_name:', div_node.name) - path_id, parent_nodes, _ = self.model.match_parent_paths( - div_node, - [ - ( - ["Sqrt", "Max", "Min", "Add", "GlobalAveragePool", "Mul", "Sub"], - [1, 0, 0, 0, None, 0, None], - ), - ], - output_name_to_node, - ) - if path_id < 0: - return - - sub_node = parent_nodes[-1] - if sub_node not in children: - return - - second_add_node = parent_nodes[3] - i, add_weight = self.model.get_constant_input(second_add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.warning(f"epsilon value is not expeced: {add_weight}") - return - - mul_node = input_name_to_nodes[div_node.output[0]][0] - if mul_node.op_type != "Mul": - return - - last_add_node = input_name_to_nodes[mul_node.output[0]][0] - if last_add_node.op_type != "Add": - return - - subgraph_nodes = [node] - subgraph_nodes.extend(children) - subgraph_nodes.extend(parent_nodes[:-1]) - - subgraph_nodes.extend([last_add_node, mul_node, div_node]) - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - last_add_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug(f"It is not safe to fuse LayerNormalization node. 
Skip") - return - - weight_input = mul_node.input[ - 1 - self.model.input_index(div_node.output[0], mul_node) - ] - if not self.model.is_constant_with_specified_dimension( - weight_input, 1, "layernorm weight" - ): - return - - bias_input = last_add_node.input[ - 1 - self.model.input_index(mul_node.output[0], last_add_node) - ] - if not self.model.is_constant_with_specified_dimension( - bias_input, 1, "layernorm bias" - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - normalize_node = helper.make_node( - "LayerNormalization", - inputs=[node.input[0], weight_input, bias_input], - outputs=[last_add_node.output[0]], - name=self.model.create_node_name( - "LayerNormalization", name_prefix="LayerNorm" - ), - ) - normalize_node.attribute.extend( - [helper.make_attribute("epsilon", float(add_weight))] - ) - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - - -class FusionLayerNormalizationTF(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "LayerNormalization", "Add", "TF") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - Layer Norm from Tensorflow model(using keras2onnx or tf2onnx): - +------------------------------------+ - | | - | | - (Cast_1) | - | | - | v (B) (B) (A) - Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocol --> Mul --> Mul --> Sub --> Add - | | | ^ ^ - | | | | | - | +--------------------------------------------------(Cast_2)-------------------------------|-------+ | - | v | - +---------------------------------------------------------------------------------------------------------------> Mul--------------------+ - """ - return_indice = [] - _, parent_nodes, return_indice = self.model.match_parent_paths( - node, - [ - ( - [ - "Sub", - "Mul", - "Mul", - "Reciprocal", - "Sqrt", - "Add", - "ReduceMean", - "Mul", - "Sub", - "ReduceMean", - ], - [1, 1, None, 0, 0, 0, None, 0, 0, None], - ), - ( - [ - "Sub", - "Mul", - "Mul", - "Reciprocal", - "Sqrt", - "Add", - "Cast", - "ReduceMean", - "Mul", - "Sub", - "ReduceMean", - ], - [1, 1, None, 0, 0, 0, 0, None, 0, 0, None], - ), - ], - output_name_to_node, - ) # yapf: disable - - if parent_nodes is None: - return - - assert len(return_indice) == 3 - if not ( - return_indice[0] in [0, 1] - and return_indice[1] in [0, 1] - and return_indice[2] in [0, 1] - ): - logger.debug( - "return indice is exepected in [0, 1], but got {return_indice}" - ) - return - - ( - sub_node_0, - mul_node_0, - mul_node_1, - reciprocol_node, - sqrt_node, - add_node_0, - ) = parent_nodes[:6] - reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[ - -4: - ] - - cast_node_3 = None - if len(parent_nodes) == 11: - cast_node_3 = parent_nodes[6] - assert cast_node_3.op_type == "Cast" - - mul_node_3 = self.model.match_parent(node, "Mul", 0, output_name_to_node) - if mul_node_3 is None: - logger.debug("mul_node_3 not found") - return - - node_before_reduce = self.model.get_parent( - reduce_mean_node_1, 0, output_name_to_node - ) - root_node = ( - node_before_reduce - if cast_node_3 is None - else self.model.get_parent(node_before_reduce, 0, output_name_to_node) - ) - if root_node is None: - logger.debug("root node is none") - return - - i, epsilon = self.model.get_constant_input(add_node_0) - if ( - epsilon is None - or epsilon <= 0 - or (epsilon > 1.0e-5 and cast_node_3 is None) - ): - logger.debug("epsilon is not matched") - return - - if cast_node_3 
is None and ( - reduce_mean_node_1.input[0] not in mul_node_3.input - or reduce_mean_node_1.input[0] not in sub_node_1.input - ): - logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") - return - - if cast_node_3 is not None and ( - node_before_reduce.input[0] not in mul_node_3.input - or reduce_mean_node_1.input[0] not in sub_node_1.input - ): - logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") - return - - if mul_node_2.input[0] != mul_node_2.input[1]: - logger.debug("mul_node_2 shall have two same inputs") - return - - subgraph_nodes = [ - node, - sub_node_0, - mul_node_0, - mul_node_1, - reciprocol_node, - sqrt_node, - add_node_0, - reduce_mean_node_0, - mul_node_2, - sub_node_1, - reduce_mean_node_1, - mul_node_3, - ] - - if cast_node_3 is not None: - cast_node_2 = self.model.match_parent( - mul_node_0, "Cast", 0, output_name_to_node - ) - if cast_node_2 is None: - logger.debug("cast_node_2 not found") - return - subgraph_nodes.extend([node_before_reduce, cast_node_2, cast_node_3]) - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - node.output, - self.model.input_name_to_nodes(), - self.model.output_name_to_node(), - ): - logger.debug("not safe to fuse layer normalization") - return - - self.nodes_to_remove.extend(subgraph_nodes) - - weight_input = mul_node_1.input[1] - bias_input = sub_node_0.input[0] - - # TODO: add epsilon attribute - fused_node = helper.make_node( - "LayerNormalization", - inputs=[mul_node_3.input[0], weight_input, bias_input], - outputs=[node.output[0]], - name=self.model.create_node_name( - "LayerNormalization", name_prefix="LayerNorm" - ), - ) - fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))]) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py deleted file mode 100644 index c0bb11b3bdd6bcbb994b8ad83501be2d9c1c4505..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
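The layer-normalization passes above (PyTorch, Keras, and TensorFlow exports) all recognize the same underlying decomposition: mean subtraction, variance, epsilon, normalization, then the learned scale and bias, so replacing the subgraph with one `LayerNormalization` node is numerically equivalent up to floating-point rounding. A minimal NumPy sketch of the computation that the ReduceMean/Sub/Pow/ReduceMean/Add/Sqrt/Div/Mul/Add chain implements (function and variable names are illustrative):

```python
import numpy as np

def layernorm_decomposed(x, weight, bias, eps=1e-5):
    # ReduceMean -> Sub -> Pow(2) -> ReduceMean -> Add(eps) -> Sqrt -> Div -> Mul -> Add
    mean = x.mean(axis=-1, keepdims=True)
    diff = x - mean
    var = (diff ** 2).mean(axis=-1, keepdims=True)
    return diff / np.sqrt(var + eps) * weight + bias

x = np.random.rand(2, 4, 8).astype(np.float32)
out = layernorm_decomposed(x, np.ones(8, dtype=np.float32), np.zeros(8, dtype=np.float32))
print(out.shape)  # (2, 4, 8)
```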
-# -------------------------------------------------------------------------- -from argparse import ArgumentParser - - -class AttentionMaskFormat: - MaskIndexEnd = 0 - MaskIndexEndAndStart = 1 - AttentionMask = 2 - NoMask = 3 - - -class FusionOptions: - """Options of fusion in graph optimization""" - - def __init__(self, model_type): - self.enable_gelu = True - self.enable_layer_norm = True - self.enable_attention = True - self.enable_skip_layer_norm = True - self.enable_embed_layer_norm = True - self.enable_bias_skip_layer_norm = True - self.enable_bias_gelu = True - self.enable_gelu_approximation = False - self.enable_qordered_matmul = True - - self.enable_shape_inference = True - self.enable_swint_opt = False - self.enable_format_roformer = False - self.enable_gpt2_classify = False - self.enable_vit = False - self.enable_omdet = False - self.attention_mask_format = AttentionMaskFormat.AttentionMask - - if model_type == "gpt2": - self.enable_skip_layer_norm = False - self.enable_gpt2_classify = True - elif model_type == "swint": - self.enable_swint_opt = True - elif model_type == "roformer": - self.enable_format_roformer = True - elif model_type == "vit": - self.enable_vit = True - elif model_type == "omdet": - self.enable_omdet = True - - def use_raw_attention_mask(self, use_raw_mask=True): - if use_raw_mask: - self.attention_mask_format = AttentionMaskFormat.AttentionMask - else: - self.attention_mask_format = AttentionMaskFormat.MaskIndexEnd - - def disable_attention_mask(self): - self.attention_mask_format = AttentionMaskFormat.NoMask - - @staticmethod - def parse(args): - options = FusionOptions(args.model_type) - if args.disable_gelu: - options.enable_gelu = False - if args.disable_layer_norm: - options.enable_layer_norm = False - if args.disable_attention: - options.enable_attention = False - if args.disable_skip_layer_norm: - options.enable_skip_layer_norm = False - if args.disable_embed_layer_norm: - options.enable_embed_layer_norm = False - if args.disable_bias_skip_layer_norm: - options.enable_bias_skip_layer_norm = False - if args.disable_bias_gelu: - options.enable_bias_gelu = False - if args.enable_gelu_approximation: - options.enable_gelu_approximation = True - if args.disable_shape_inference: - options.enable_shape_inference = False - if args.use_mask_index: - options.use_raw_attention_mask(False) - if args.no_attention_mask: - options.disable_attention_mask() - return options - - @staticmethod - def add_arguments(parser: ArgumentParser): - parser.add_argument( - "--disable_attention", - required=False, - action="store_true", - help="disable Attention fusion", - ) - parser.set_defaults(disable_attention=False) - - parser.add_argument( - "--disable_skip_layer_norm", - required=False, - action="store_true", - help="disable SkipLayerNormalization fusion", - ) - parser.set_defaults(disable_skip_layer_norm=False) - - parser.add_argument( - "--disable_embed_layer_norm", - required=False, - action="store_true", - help="disable EmbedLayerNormalization fusion", - ) - parser.set_defaults(disable_embed_layer_norm=False) - - parser.add_argument( - "--disable_bias_skip_layer_norm", - required=False, - action="store_true", - help="disable Add Bias and SkipLayerNormalization fusion", - ) - parser.set_defaults(disable_bias_skip_layer_norm=False) - - parser.add_argument( - "--disable_bias_gelu", - required=False, - action="store_true", - help="disable Add Bias and Gelu/FastGelu fusion", - ) - parser.set_defaults(disable_bias_gelu=False) - - parser.add_argument( - "--disable_layer_norm", - 
required=False, - action="store_true", - help="disable LayerNormalization fusion", - ) - parser.set_defaults(disable_layer_norm=False) - - parser.add_argument( - "--disable_gelu", - required=False, - action="store_true", - help="disable Gelu fusion", - ) - parser.set_defaults(disable_gelu=False) - - parser.add_argument( - "--enable_gelu_approximation", - required=False, - action="store_true", - help="enable Gelu/BiasGelu to FastGelu conversion", - ) - parser.set_defaults(enable_gelu_approximation=False) - - parser.add_argument( - "--disable_shape_inference", - required=False, - action="store_true", - help="disable symbolic shape inference", - ) - parser.set_defaults(disable_shape_inference=False) - - parser.add_argument( - "--use_mask_index", - required=False, - action="store_true", - help="use mask index instead of raw attention mask in attention operator", - ) - parser.set_defaults(use_mask_index=False) - - parser.add_argument( - "--no_attention_mask", - required=False, - action="store_true", - help="no attention mask. Only works for model_type=bert", - ) - parser.set_defaults(no_attention_mask=False) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py deleted file mode 100644 index 9afa3edbc37f2ddd7b15c3eb976ee1cd9e72e356..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py +++ /dev/null @@ -1,527 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple - -import numpy as np -from onnx import NodeProto, helper - -from .fusion_attention import AttentionMask -from .fusion_base import Fusion -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionQOrderedAttention(Fusion): - def __init__( - self, - model: OnnxModel, - hidden_size: int, - num_heads: int, - attention_mask: AttentionMask, - ): - self.hidden_size = hidden_size - self.num_heads = num_heads - self.attention_mask = attention_mask - - super().__init__(model, "QOrderedAttention", "QOrderedLayerNormalization") - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. 
- Args: - reshape_q (NodeProto): reshape node for Q - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - - # Check if the second input to Reshape flows through a Constant node - # TODO: Investigate why FusionAttention doesn't have such logic - constant_node = self.model.match_parent_path(reshape_q, ["Constant"], [1]) - - if constant_node is None: - return ( - self.num_heads, - self.hidden_size, - ) # Fall back to user specified value - else: - constant_node = constant_node[0] - - if len(constant_node.attribute) != 1: - return ( - self.num_heads, - self.hidden_size, - ) # Fall back to user specified value - - # This is assuming it is a Tensor attribute (this is a safe assumption) - q_shape = constant_node.attribute[0].t - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return self.num_heads, self.hidden_size # Fall back to user specified value - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - if self.num_heads > 0 and num_heads != self.num_heads: - if self.num_heads_warning: - logger.warning( - f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value." - ) - self.num_heads_warning = False # Do not show the warning more than once - - if self.hidden_size > 0 and hidden_size != self.hidden_size: - if self.hidden_size_warning: - logger.warning( - f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." 
- ) - self.hidden_size_warning = ( - False # Do not show the warning more than once - ) - - return num_heads, hidden_size - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - add_before_layernorm = self.model.match_parent_path( - normalize_node, - ["QuantizeLinear", "Add"], - [0, 0], - ) - - if add_before_layernorm is not None: - start_node = add_before_layernorm[-1] - else: - return - - # Input QDQ nodes - dequantize_input = self.model.match_parent_path( - start_node, - ["DequantizeLinear"], - [None], - ) - - if dequantize_input is None: - logger.debug( - "fuse_qordered_attention: failed to match input qdq nodes path" - ) - return - - dequantize_input = dequantize_input[-1] - - # QKV nodes - qkv_nodes = self.model.match_parent_path( - start_node, - [ - "Add", - "MatMul", - "Reshape", - "Transpose", - "DequantizeLinear", - "QuantizeLinear", - "MatMul", - ], - [None, None, 0, 0, 0, 0, 0], - ) - - if qkv_nodes is None: - logger.debug("fuse_qordered_attention: failed to match qkv path") - return - - ( - _, - projection_matmul, - reshape_qkv, - transpose_qkv, - dequantize_qkv, - quantize_qkv, - matmul_qkv, - ) = qkv_nodes - - # Make sure the Q/DQ has the proper zero points and constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_qkv, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_qkv, self.model): - return - - # Identify the root input to the Attention node - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - - other_inputs.append(input) - - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - - # V nodes - v_nodes = self.model.match_parent_path( - matmul_qkv, - [ - "Transpose", - "Reshape", - "DequantizeLinear", - "QuantizeLinear", - "Add", - "MatMul", - ], - [1, 0, 0, 0, 0, None], - ) - - if v_nodes is None: - logger.debug("fuse_qordered_attention: failed to match v path") - return - - (_, _, dequantize_v, quantize_v, add_v, matmul_v) = v_nodes - - # Make sure the Q/DQ has the proper zero points and constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_v, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_v, self.model): - return - - # V MatMul weight - dequantize_v_matmul_weight = self.model.match_parent_path( - matmul_v, ["DequantizeLinear"], [1] - ) - - if dequantize_v_matmul_weight is None: - logger.debug("fuse_qordered_attention: failed to match v path") - return - - dequantize_v_matmul_weight = dequantize_v_matmul_weight[0] - - if self.model.get_constant_value(dequantize_v_matmul_weight.input[0]) is None: - return - - # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales - # Per-channel scales are supported for weights alone - if not FusionUtils.check_qdq_node_for_fusion( - dequantize_v_matmul_weight, self.model, False - ): - return - - # QK nodes - qk_nodes = self.model.match_parent_path( - matmul_qkv, - [ - "DequantizeLinear", - "QuantizeLinear", - "Softmax", - "Add", - "Div", - "DequantizeLinear", - "QuantizeLinear", - "MatMul", - ], - [0, 0, 0, 0, None, 0, 0, 0], - ) - - if qk_nodes is None: - logger.debug("fuse_qordered_attention: failed to match qk path") - return - - ( - dequantize_qk_softmax, - quantize_qk_softmax, - softmax_qk, - add_qk, - div_qk, - dequantize_qk, - quantize_qk, - matmul_qk, - ) = qk_nodes - - # Make sure the Q/DQ has the proper zero points and 
constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_qk_softmax, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk_softmax, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(quantize_qk, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk, self.model): - return - - # Q nodes - q_nodes = self.model.match_parent_path( - matmul_qk, - [ - "Transpose", - "Reshape", - "DequantizeLinear", - "QuantizeLinear", - "Add", - "MatMul", - ], - [0, 0, 0, 0, 0, None], - ) - - if q_nodes is None: - logger.debug("fuse_qordered_attention: failed to match q path") - return - - (_, reshape_q, dequantize_q, quantize_q, add_q, matmul_q) = q_nodes - - # Make sure the Q/DQ has the proper zero points and constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_q, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_q, self.model): - return - - # Q MatMul weight - dequantize_q_matmul_weight = self.model.match_parent_path( - matmul_q, ["DequantizeLinear"], [1] - ) - - if dequantize_q_matmul_weight is None: - logger.debug("fuse_qordered_attention: failed to match q path") - return - - dequantize_q_matmul_weight = dequantize_q_matmul_weight[0] - - if self.model.get_constant_value(dequantize_q_matmul_weight.input[0]) is None: - return - - # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales - # Per-channel scales are supported for weights alone - if not FusionUtils.check_qdq_node_for_fusion( - dequantize_q_matmul_weight, self.model, False - ): - return - - # K nodes - k_nodes = self.model.match_parent_path( - matmul_qk, - [ - "Transpose", - "Reshape", - "DequantizeLinear", - "QuantizeLinear", - "Add", - "MatMul", - ], - [1, 0, 0, 0, 0, None], - ) - - if k_nodes is None: - logger.debug("fuse_qordered_attention: failed to match k path") - return - - (_, _, dequantize_k, quantize_k, add_k, matmul_k) = k_nodes - - # Make sure the Q/DQ has the proper zero points and constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_k, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_k, self.model): - return - - # K MatMul weight - dequantize_k_matmul_weight = self.model.match_parent_path( - matmul_k, ["DequantizeLinear"], [1] - ) - - if dequantize_k_matmul_weight is None: - logger.debug("fuse_qordered_attention: failed to match k path") - return - - dequantize_k_matmul_weight = dequantize_k_matmul_weight[0] - - if self.model.get_constant_value(dequantize_k_matmul_weight.input[0]) is None: - return - - # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales - # Per-channel scales are supported for weights alone - if not FusionUtils.check_qdq_node_for_fusion( - dequantize_k_matmul_weight, self.model, False - ): - return - - # Mask nodes - mask_nodes = self.model.match_parent_path( - add_qk, ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0, 0] - ) - - if mask_nodes is None: - logger.debug("fuse_qordered_attention: failed to match mask_nodes path") - return - - # Ascertain `qkv_hidden_sizes` attribute value - q_weight = self.model.get_initializer(dequantize_q_matmul_weight.input[0]) - k_weight = self.model.get_initializer(dequantize_k_matmul_weight.input[0]) - v_weight = self.model.get_initializer(dequantize_v_matmul_weight.input[0]) - - qw = NumpyHelper.to_array(q_weight) - kw = NumpyHelper.to_array(k_weight) - vw = 
NumpyHelper.to_array(v_weight) - - qw_out_size = np.prod(qw.shape[1:]) - kw_out_size = np.prod(kw.shape[1:]) - vw_out_size = np.prod(vw.shape[1:]) - - # Form QOrderedAttention node - if ( - matmul_v.input[0] == root_input - and matmul_q.input[0] == root_input - and matmul_k.input[0] == root_input - ): - mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) - - # Ascertain `num_heads` and `hidden_size` - num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q) - - # Formulate the inputs - # Actual quantized input - attention_inputs = [dequantize_input.input[0]] - attention_inputs.append(dequantize_input.input[1]) - - attention_inputs.append(dequantize_q.input[1]) - attention_inputs.append(dequantize_k.input[1]) - attention_inputs.append(dequantize_v.input[1]) - - attention_inputs.append(dequantize_q_matmul_weight.input[0]) - attention_inputs.append(dequantize_k_matmul_weight.input[0]) - attention_inputs.append(dequantize_v_matmul_weight.input[0]) - - attention_inputs.append(dequantize_q_matmul_weight.input[1]) - attention_inputs.append(dequantize_k_matmul_weight.input[1]) - attention_inputs.append(dequantize_v_matmul_weight.input[1]) - - if self.model.get_initializer(add_q.input[0]): - attention_inputs.append(add_q.input[0]) - else: # second input is the constant bias - attention_inputs.append(add_q.input[1]) - - if self.model.get_initializer(add_k.input[0]): - attention_inputs.append(add_k.input[0]) - else: # second input is the constant bias - attention_inputs.append(add_k.input[1]) - - if self.model.get_initializer(add_v.input[0]): - attention_inputs.append(add_v.input[0]) - else: # second input is the constant bias - attention_inputs.append(add_v.input[1]) - - attention_inputs.append(quantize_qk.input[1]) - attention_inputs.append(quantize_qk_softmax.input[1]) - attention_inputs.append(dequantize_qkv.input[1]) - - # Mask input - if mask_index is not None: - attention_inputs.append(mask_index) - else: - attention_inputs.append("") - - # The MatMul weight 'B' and 'bias' need some post-processing - # Transpose weight 'B' from order ROW to order COL - # This offline transpose is needed only while using the CUDA EP - # TODO: Make this fusion logic EP-agnostic ? 
- q_weight_tensor = self.model.get_initializer( - dequantize_q_matmul_weight.input[0] - ) - FusionUtils.transpose_2d_int8_tensor(q_weight_tensor) - - k_weight_tensor = self.model.get_initializer( - dequantize_k_matmul_weight.input[0] - ) - FusionUtils.transpose_2d_int8_tensor(k_weight_tensor) - - v_weight_tensor = self.model.get_initializer( - dequantize_v_matmul_weight.input[0] - ) - FusionUtils.transpose_2d_int8_tensor(v_weight_tensor) - - # Name and create Attention node - attention_node_name = self.model.create_node_name("QOrderedAttention") - - attention_node = helper.make_node( - "QOrderedAttention", - inputs=attention_inputs, - outputs=[reshape_qkv.output[0]], - name=attention_node_name, - ) - - self.model.replace_node_input( - dequantize_qkv, dequantize_qkv.input[0], attention_node.output[0] - ) - self.model.replace_node_input( - projection_matmul, projection_matmul.input[0], dequantize_qkv.output[0] - ) - - attention_node.attribute.extend( - [helper.make_attribute("num_heads", num_heads)] - ) - attention_node.attribute.extend([helper.make_attribute("order_input", 1)]) - attention_node.attribute.extend([helper.make_attribute("order_weight", 0)]) - attention_node.attribute.extend([helper.make_attribute("order_output", 1)]) - attention_node.attribute.extend( - [ - helper.make_attribute( - "qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size] - ) - ] - ) - - attention_node.domain = "com.microsoft" - - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - - self.nodes_to_remove.extend( - [reshape_qkv, transpose_qkv, quantize_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes) - self.nodes_to_remove.extend( - [ - dequantize_q_matmul_weight, - dequantize_k_matmul_weight, - dequantize_v_matmul_weight, - ] - ) - - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py deleted file mode 100644 index ebd165c4bc5da002eb53b2376c1e69facf40dec4..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Dict - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionQOrderedGelu(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "QOrderedGelu", ["Gelu", "FastGelu"]) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - INPUT PATTERN - Fuse (quantized) Gelu subgraph into one node QOrderedGelu: - -> quantized input -> DQ -> Gelu -> Q -> - - (or) - - -> quantized input -> DQ -> FastGelu -> Q -> - - OUTPUT PATTERN - -> QOrderedGelu -> - """ - gelu_children = self.model.get_children(node, input_name_to_nodes) - - # Should only have 1 child - QuantizeLinear (or) - # Should have 2 children - QuantizeLinear + Shape - if not ( - (len(gelu_children) == 1 and gelu_children[0].op_type == "QuantizeLinear") - or ( - len(gelu_children) == 2 - and gelu_children[0].op_type == "QuantizeLinear" - and gelu_children[1].op_type == "Shape" - ) - ): - return - - downstream_quantize_node = gelu_children[0] - downstream_shape_node = None - - if len(gelu_children) == 2: - downstream_shape_node = gelu_children[1] - - if not FusionUtils.check_qdq_node_for_fusion( - downstream_quantize_node, self.model - ): - return - - # The first input to Gelu should flow through a DequantizeLinear node - first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( - node, - [(["DequantizeLinear"], [0])], - output_name_to_node, - ) - - if first_path_id < 0: - return - - upstream_dequantize_node = first_input_parent_nodes[0] - - if not FusionUtils.check_qdq_node_for_fusion( - upstream_dequantize_node, self.model - ): - return - - # Fusion logic - subgraph_nodes = [node] # Gelu/FastGelu - subgraph_nodes.extend( - [downstream_quantize_node, upstream_dequantize_node] - ) # Relevant Q, DQ nodes - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [node.output[0], downstream_quantize_node.output[0]] - if downstream_shape_node is not None - else downstream_quantize_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug(f"It is not safe to fuse QOrderedGelu node. Skip") - return - - self.nodes_to_remove.extend(subgraph_nodes) - - ordered_gelu_node = helper.make_node( - "QOrderedGelu", - inputs=[ - upstream_dequantize_node.input[0], - upstream_dequantize_node.input[1], - downstream_quantize_node.input[1], - ], - outputs=[downstream_quantize_node.output[0]], - name=self.model.create_node_name( - "QOrderedGelu", name_prefix="QOrderedGelu" - ), - ) - - # Arrange the downstream Shape's input to be fed from the - # downstream QuantizeLinear node, so that fusion will - # be deemed safe - if downstream_shape_node is not None: - self.model.replace_node_input( - downstream_shape_node, - downstream_shape_node.input[0], - downstream_quantize_node.output[0], - ) - - # TODO: We only support CuBlasLt order ORDER_ROW for now. - # Once we start supporting other data ordering format(s), we - # will support user configuring the data ordering for the op. 
- ordered_gelu_node.attribute.extend([helper.make_attribute("order_X", 1)]) - ordered_gelu_node.attribute.extend([helper.make_attribute("order_Y", 1)]) - - ordered_gelu_node.domain = "com.microsoft" - - self.nodes_to_add.append(ordered_gelu_node) - self.node_name_to_graph_name[ordered_gelu_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py deleted file mode 100644 index 94e38a0f5b549cb217359926172eb4aa510ad68b..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Dict - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionQOrderedLayerNormalization(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "QOrderedLayerNormalization", "LayerNormalization") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - Fuse (quantized) Layer Normalization subgraph into one node QOrderedLayerNormalization: - quantized input -> DQ - | - | - (other inputs)-> LayerNormalization --> Q --> - - should become - - (quantized input + other inputs)-> QOrderedLayerNormalization --> Q --> - """ - - children = self.model.get_children(node, input_name_to_nodes) - - # Should only have 1 child - QuantizeLinear (or) - # Should have 2 children - QuantizeLinear + Shape - if not ( - (len(children) == 1 and children[0].op_type == "QuantizeLinear") - or ( - len(children) == 2 - and children[0].op_type == "QuantizeLinear" - and children[1].op_type == "Shape" - ) - ): - return - - downstream_quantize_node = children[0] - downstream_shape_node = None - - if len(children) == 2: - downstream_shape_node = children[1] - - if not FusionUtils.check_qdq_node_for_fusion( - downstream_quantize_node, self.model - ): - return - - # The first input to LayerNormalization should flow through a DequantizeLinear node - first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( - node, - [(["DequantizeLinear"], [0])], - output_name_to_node, - ) - - if first_path_id < 0: - return - - upstream_dequantize_node = first_input_parent_nodes[0] - - if not FusionUtils.check_qdq_node_for_fusion( - upstream_dequantize_node, self.model - ): - return - - # Fusion logic 
- subgraph_nodes = [node] # LayerNormalization - subgraph_nodes.extend( - [downstream_quantize_node] - ) # Q node after LayerNormalization - - upstream_dequantize_node_children = self.model.get_children( - upstream_dequantize_node, input_name_to_nodes - ) - - # In GPT2, the DQ node will be feeding a residual downstream Add and hence, - # we do not want to remove it - if len(upstream_dequantize_node_children) == 1: - subgraph_nodes.extend( - [upstream_dequantize_node] - ) # DQ node before LayerNormalization - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [node.output[0], downstream_quantize_node.output[0]] - if downstream_shape_node is not None - else downstream_quantize_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug( - f"It is not safe to fuse QOrderedLayerNormalization node. Skip" - ) - return - - self.nodes_to_remove.extend(subgraph_nodes) - - normalize_node = helper.make_node( - "QOrderedLayerNormalization", - inputs=[ - upstream_dequantize_node.input[0], - upstream_dequantize_node.input[1], - node.input[1], - node.input[2], - downstream_quantize_node.input[1], - ], - outputs=[downstream_quantize_node.output[0]], - name=self.model.create_node_name( - "QOrderedLayerNormalization", name_prefix="QOrderedLayerNormalization" - ), - ) - - # Arrange the downstream Shape's input to be fed from the - # downstream QuantizeLinear node, so that fusion will - # be deemed safe - if downstream_shape_node is not None: - self.model.replace_node_input( - downstream_shape_node, - downstream_shape_node.input[0], - downstream_quantize_node.output[0], - ) - - # TODO: We only support CuBlasLt order ORDER_ROW for now. - # Once we start supporting other data ordering format(s), we - # will support user configuring the data ordering for the op. - normalize_node.attribute.extend([helper.make_attribute("order_X", 1)]) - normalize_node.attribute.extend([helper.make_attribute("order_Y", 1)]) - - normalize_node.domain = "com.microsoft" - - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py deleted file mode 100644 index 8c8050e1cdfb0061b734b1224aa0006b1c09cdef..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Dict - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionQOrderedMatMul(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "QOrderedMatMul", "MatMul") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - matmul_children = self.model.get_children(node, input_name_to_nodes) - - # Should only have 1 child - Bias Add - if len(matmul_children) != 1 or matmul_children[0].op_type != "Add": - return - - bias_add_node = matmul_children[0] - - # Atleast one of the inputs to Bias Add node must be a constant - bias_add_node_index = 0 - if ( - self.model.get_constant_value(bias_add_node.input[0]) is None - and self.model.get_constant_value(bias_add_node.input[1]) is None - ): - return - - if self.model.get_constant_value(bias_add_node.input[0]) is None: - bias_add_node_index = 1 - - bias_add_children = self.model.get_children(bias_add_node, input_name_to_nodes) - - if len(bias_add_children) != 1: - return - - bias_add_child = bias_add_children[0] - - # Bias Add can have another Add downstream (Residual Add layer) - residual_add_node = None - - downstream_quantize_node = None - - if bias_add_child.op_type == "Add": - residual_add_node = bias_add_child - - residual_add_children = self.model.get_children( - residual_add_node, input_name_to_nodes - ) - - if ( - len(residual_add_children) != 1 - or residual_add_children[0].op_type != "QuantizeLinear" - ): - return - - downstream_quantize_node = residual_add_children[0] - - elif bias_add_child.op_type == "QuantizeLinear": - downstream_quantize_node = bias_add_child - - else: - return - - # Make sure the downstream QuantizeLinear has the proper zero points and scales - if not FusionUtils.check_qdq_node_for_fusion( - downstream_quantize_node, self.model - ): - return - - # The first input to MatMul should flow through a DequantizeLinear node - first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( - node, - [(["DequantizeLinear"], [0])], - output_name_to_node, - ) - - # If Attention is not fused, this is the pattern to look for - # leading upto the MatMul - reshape_node_0 = None - transpose_node_0 = None - if first_path_id < 0: - first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( - node, - [ - ( - ["Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear"], - [0, 0, 0, 0], - ) - ], - output_name_to_node, - ) - - if first_path_id < 0: - return - - reshape_node_0 = first_input_parent_nodes[0] - transpose_node_0 = first_input_parent_nodes[1] - dequantize_node_0 = first_input_parent_nodes[2] - else: - dequantize_node_0 = first_input_parent_nodes[0] - - # Make sure the upstream DequantizeLinear-0 has the proper zero points and scales - if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_0, self.model): - return - - # The second input to MatMul should flow through a DequantizeLinear node - dequantize_node_1 = None - is_weight_transpose_required = True - - weight_path_id, weight_nodes, _ = self.model.match_parent_paths( - node, - [ - ( - [ - "DequantizeLinear", - "QuantizeLinear", - "Transpose", - "DequantizeLinear", - ], - [1, 0, 0, 0], - ) - ], - output_name_to_node, - ) - - if weight_path_id < 0: - weight_path_id, weight_nodes, _ = self.model.match_parent_paths( - node, - [(["DequantizeLinear"], 
[1])], - output_name_to_node, - ) - - if weight_path_id < 0: - return - - dequantize_node_1 = weight_nodes[0] - else: - is_weight_transpose_required = False - dequantize_node_1 = weight_nodes[3] - - # Check if weight 'B' is a constant - if self.model.get_constant_value(dequantize_node_1.input[0]) is None: - return - - # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales - # Per-channel scales are supported for weights alone - if not FusionUtils.check_qdq_node_for_fusion( - dequantize_node_1, self.model, False - ): - return - - # Make sure the upstream flow into the Residual Add node flows through a DQ node - residual_add_dequantize_node = None - - if residual_add_node is not None: - ( - residual_path_id, - residual_input_parent_nodes, - _, - ) = self.model.match_parent_paths( - residual_add_node, - [ - (["DequantizeLinear"], [1]), - ], - output_name_to_node, - ) - - if residual_path_id < 0: - return - - residual_add_dequantize_node = residual_input_parent_nodes[0] - - # Make sure the upstream DequantizeLinear to the Residual Add has the proper zero points and scales - if ( - residual_add_dequantize_node is not None - and not FusionUtils.check_qdq_node_for_fusion( - residual_add_dequantize_node, self.model - ) - ): - return - - # Subgraph nodes to be fused - subgraph_nodes = [node, bias_add_node] # MatMul + Bias Add - - if residual_add_node is not None: - subgraph_nodes.extend([residual_add_node]) # Residual Add - - subgraph_nodes.extend(weight_nodes) - subgraph_nodes.extend([downstream_quantize_node]) # Downstream Q node - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - downstream_quantize_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug(f"It is not safe to fuse QOrderedMatMul node. Skip") - return - - # Deal with the case where-in the Attention subgraph is not fused - if transpose_node_0 is not None: - self.model.replace_node_input( - transpose_node_0, transpose_node_0.input[0], dequantize_node_0.input[0] - ) - - # Make inputs - fused_node_inputs = [ - reshape_node_0.output[0] - if reshape_node_0 is not None - else dequantize_node_0.input[0], - dequantize_node_0.input[1], - dequantize_node_1.input[0], - dequantize_node_1.input[1], - downstream_quantize_node.input[1], - bias_add_node.input[bias_add_node_index], - ] - - if residual_add_node is not None: - fused_node_inputs.append(residual_add_dequantize_node.input[0]) - fused_node_inputs.append(residual_add_dequantize_node.input[1]) - - # The MatMul weight 'B' and 'bias' need some post-processing - # Transpose weight 'B' from order ROW to order COL - # This offline transpose is needed only while using the CUDA EP - # TODO: Make this fusion logic EP-agnostic ? 
- if is_weight_transpose_required: - weight_tensor = self.model.get_initializer(dequantize_node_1.input[0]) - FusionUtils.transpose_2d_int8_tensor(weight_tensor) - - fused_node = helper.make_node( - "QOrderedMatMul", - inputs=fused_node_inputs, - outputs=[downstream_quantize_node.output[0]], - name=self.model.create_node_name( - "QOrderedMatMul", name_prefix="QOrderedMatMul" - ), - ) - - fused_node.attribute.extend([helper.make_attribute("order_A", 1)]) - fused_node.attribute.extend([helper.make_attribute("order_B", 0)]) - fused_node.attribute.extend([helper.make_attribute("order_Y", 1)]) - - fused_node.domain = "com.microsoft" - - self.nodes_to_remove.extend(subgraph_nodes) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py deleted file mode 100644 index 2a5bf73fdf07f223be18e7bbaf20f9623ebb3fdc..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger - -import numpy as np -from onnx import TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionReshape(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "Reshape", "Reshape") - self.prune_graph: bool = False - - def replace_reshape_node(self, shape, reshape_node, concat_node): - shape_value = np.asarray([int(x) if isinstance(x, np.ndarray) else x for x in shape], dtype=np.int64) - constant_shape_name = self.model.create_node_name("Constant", "constant_shape") - new_node = helper.make_node( - "Constant", - inputs=[], - outputs=[constant_shape_name], - value=helper.make_tensor( - name="const_tensor", - data_type=TensorProto.INT64, - dims=shape_value.shape, - vals=bytes(shape_value), - raw=True, - ), - ) - reshape_node.input[1] = constant_shape_name - reshape_node.name = self.model.create_node_name("Reshape", "Reshape_Fuse") - self.nodes_to_remove.extend([concat_node]) - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node): - if reshape_node.input[1] not in output_name_to_node: - return - - concat_node = output_name_to_node[reshape_node.input[1]] - if ( - concat_node.op_type != "Concat" - or len(concat_node.input) < 3 - or len(concat_node.input) > 4 - ): - return - - path0 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Gather", "Shape"], - [0, 0, 0], - output_name_to_node, - ) - if path0 is None: - return - - (unsqueeze_0, gather_0, shape_0) = path0 - - path1 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Gather", "Shape"], - [1, 0, 0], - output_name_to_node, - ) - if path1 is None: - return - (unsqueeze_1, gather_1, shape_1) = path1 - - shape = [] - gather_value = self.model.get_constant_value(gather_0.input[1]) - if gather_value == 0: - shape.append(0) - - gather_value = self.model.get_constant_value(gather_1.input[1]) - if gather_value == 1: - shape.append(0) - - if len(shape) != 2: - return - - path2 = [] - path3 = [] - shape_nodes = [shape_0, shape_1] - if ( - len(concat_node.input) == 3 - and self.model.get_initializer(concat_node.input[2]) is None - ): - path2 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Mul", "Gather", "Shape"], - [2, 0, 0, 0], - output_name_to_node, - ) - if path2 is None: - path2 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"], - [2, 0, 0, 0, 0], - output_name_to_node, - ) # GPT2 exported by PyTorch 1.4 with opset_version=11 - if path2 is None: - return - - path3 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Mul", "Gather", "Shape"], - [2, 0, 1, 0], - output_name_to_node, - ) - if path3 is None: - path3 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"], - [2, 0, 1, 0, 0], - output_name_to_node, - ) # GPT2 exported by PyTorch 1.4 with opset_version=11 - if path3 is None: - return - - shape_nodes.extend([path2[-1], path3[-1]]) - shape.append(-1) - elif len(concat_node.input) > 2: - concat_2 = self.model.get_initializer(concat_node.input[2]) - if concat_2 is None: - return - concat_value = numpy_helper.to_array(concat_2) - if isinstance(concat_value, list): - shape.extend(concat_value) - else: - shape.append(concat_value) - - if ( - 
len(concat_node.input) == 4 - and self.model.get_initializer(concat_node.input[3]) is None - ): - if -1 in shape: - return - - path2 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Div", "Gather", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if path2 is None: - path2 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Div", "Squeeze", "Slice", "Shape"], - [3, 0, 0, 0, 0], - output_name_to_node, - ) # GPT2 exported by PyTorch 1.4 with opset_version=11 - if path2 is None: - return - shape_nodes.extend([path2[-1]]) - shape.append(-1) - elif len(concat_node.input) > 3: - concat_3 = self.model.get_initializer(concat_node.input[3]) - if concat_3 is None: - return - - concat_value = numpy_helper.to_array(concat_3) - if isinstance(concat_value, list): - shape.extend(concat_value) - else: - shape.append(concat_value) - - root_input = reshape_node.input[0] - same_shape_input = True - for shape_node in shape_nodes: - if shape_node.input[0] != root_input: - same_shape_input = False - - if not same_shape_input: - return - - self.replace_reshape_node(shape, reshape_node, concat_node) - - # TODO(tlwu): Subgraph blocks pruning un-used nodes. Add code to remove un-used nodes safely. - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py deleted file mode 100644 index b3ec51a5a25af26a36ef9fc0015b80104e4cd67f..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-# - -import logging -from typing import Dict - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = logging.getLogger(__name__) - - -class FusionRMSNorm(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "RMSNorm", "Mul") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - if node.op_type != "Mul": - return - - sim_ln_nodes = None - # SimplifiedLayerNorm calculation (notation from https://onnx.ai/onnx/operators/onnx__LayerNormalization.html#summary): - # DD = Pow(D, 2) - # Var = ReduceMean(DD) - # VarEps = Add(Var, epsilon) - # StdDev = Sqrt(VarEps) - # InvStdDev = Div(1, StdDev) - # Normalized = Mul(D, InvStdDev) - # NormalizedScaled = Mul(Normalized, Scale) - - # RMSNorm - # +-------------------------------------------------------+ - # | | - # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_1 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], - [1, 1, 1, 0, 0, 0, 0], - ) - # RMSNorm - # +-------------------------------------------------------+ - # | | - # Gather --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_2 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Gather"], - [1, 1, 1, 0, 0, 0, 0], - ) - - # For LLaMA from Microsoft custom export: - # sim_ln_nodes_3 uses a different start parent index than sim_ln_nodes_1 - # - # RMSNorm - # +-------------------------------------------------------+ - # | | - # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_3 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], - [0, 1, 1, 0, 0, 0, 0], - ) - - # sim_ln_nodes_4 starts with a graph input instead of an Add node like sim_ln_nodes_3 - # - # RMSNorm - # +-----------------------------------------------+ - # | | - # graph_input --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul - # | - # node - sim_ln_nodes_4 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow"], - [0, 1, 1, 0, 0, 0], - ) - - add_node, pow_node = None, None - if sim_ln_nodes_1 is not None: - sim_ln_nodes = sim_ln_nodes_1 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] - elif sim_ln_nodes_2 is not None: - sim_ln_nodes = sim_ln_nodes_2 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] - elif sim_ln_nodes_3 is not None: - sim_ln_nodes = sim_ln_nodes_3 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] - elif sim_ln_nodes_4 is not None: - sim_ln_nodes = sim_ln_nodes_4 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-1] - # Verify that parent input to Pow node is graph_input - if pow_node.input[0] not in self.model.get_graphs_input_names(): - return - else: - return - - layernorm_weight_index = ( - 1 if sim_ln_nodes in (sim_ln_nodes_3, sim_ln_nodes_4) else 0 - ) - starts_with_graph_input = sim_ln_nodes == sim_ln_nodes_4 - - if self.model.find_constant_input(pow_node, 2.0) != 1: - return - - root_input = pow_node.input[0] - if root_input != sim_ln_nodes[0].input[0]: - return - - i, add_weight = self.model.get_constant_input(add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.warning(f"epsilon value is not expected: {add_weight}") - return - - self.nodes_to_remove.extend( - sim_ln_nodes[:-1] if not 
starts_with_graph_input else sim_ln_nodes - ) - self.nodes_to_remove.append(node) - - normalize_node = helper.make_node( - "RMSNormPluginDynamic_IxRT", - inputs=[root_input, node.input[layernorm_weight_index]], - outputs=[node.output[0]], - name=self.model.create_node_name( - "RMSNormPluginDynamic_IxRT", name_prefix="RMSNorm_" - ), - ) - - normalize_node.domain = "com.iluvatar" - normalize_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - normalize_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - normalize_node.attribute.extend( - [helper.make_attribute("epsilon", float(add_weight))] - ) - normalize_node.attribute.extend([helper.make_attribute("axis", -1)]) - normalize_node.attribute.extend([helper.make_attribute("stash_type", 1)]) - gamma_data = self.model.get_initializer(normalize_node.input[1]) - gamma_data_np = NumpyHelper.to_array(gamma_data) - normalize_node.attribute.extend( - [helper.make_attribute("hidden_size", int(gamma_data_np.shape[0]))] - ) - - normalize_node.attribute.extend([helper.make_attribute("gamma", gamma_data)]) - - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - return True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py deleted file mode 100644 index 1d99595e8e8d9dc1cde4da1c66f266251d0919ca..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py +++ /dev/null @@ -1,371 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionRoformerCrossAttention(Fusion): - """ - Fuse VideoBertAttention subgraph into one Attention node. 
- """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQkvCrossToContext_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size( - self, custom_fc: NodeProto, mul: NodeProto - ) -> Tuple[int, int]: - mul_initializer = self.model.get_initializer(mul.input[1]) - - # 检查float_data是否为空 - if len(mul_initializer.float_data) > 0: - mul_value = mul_initializer.float_data[0] - else: - # 如果float_data为空,尝试其他方式获取数据 - # 例如,如果数据存储在raw_data中 - if len(mul_initializer.raw_data) > 0: - dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type] - mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0] - else: - raise ValueError("Data not found in the mul_initializer") - - for attr in custom_fc.attribute: - if attr.name == "W": - tensor_value = attr.t - tensor_shape = [dim for dim in tensor_value.dims] - break - head_dim = math.floor(1.0 / (mul_value * mul_value)) - hidden_size = tensor_shape[0] - num_heads = hidden_size // head_dim - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input_q: str, - input_k: str, - input_v: str, - input_mask: str, - output: str, - matmul_qk_add: NodeProto, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input_q: str, - input_k: str, - input_v: str, - input_mask: str, - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("CrossAttention") - - attention_inputs = [input_q, input_k, input_v, input_mask] - - attention_node = helper.make_node( - "CustomQkvCrossToContext_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("type_mask", 4)]) #3:float mask 4:int32 mask - attention_node.attribute.extend([helper.make_attribute("scale", 1.0 / 8)]) #1 /sqrt(num_heads) - - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - - return attention_node - - def get_shape(self, edge_name): - for info in self.model.graph().value_info: - if info.name == edge_name: - return info.type.tensor_type.shape.dim - return None - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
- qkv_paths = { - "path1": ( - [ - "CustomFCPluginDynamic_IxRT", - "Reshape", - "Transpose", - "Reshape", - "MatMul", - ], - [0, 0, 0, 0, 0], - ), - "path2": ( - [ - "CustomFCPluginDynamic_IxRT", - "Reshape", - "Transpose", - "Reshape", - "MatMul", - ], - [1, 0, 0, 0, 0], - ), - } - # print('start_nodes:', start_node.name) - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - - fc_after_atten = None - if qkv_path in ["path1", "path2"]: - ( - fc_after_atten, - reshape_qkv_2, - transpose_qkv, - reshape_qkv_1, - matmul_qkv, - ) = qkv_nodes - - """ - Match - Add --> LayerNormalization --> Attention --> Add --> LayerNormalization - | | - | | - +--------------------------------------------------------- - """ - add_before_layernorm = self.model.match_parent(start_node, "Add", None) - if add_before_layernorm is not None: - node_children = input_name_to_nodes[add_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - v_paths = {"path1": (["Reshape", "Transpose", "Reshape"], [1, 0, 0])} - - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if v_path == "path1": - (reshape_v, transpose_v, v_reshape) = v_nodes - - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - - qk_paths = { - "path1": ( - ["Softmax", "Add", "Mul", "Mul", "Reshape", "MatMul"], - [0, 0, None, None, None, 0], - ) - } - - qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - # print('qk_nodes', qk_nodes[0].name) - matmul_qk_add = None - if qk_path == "path1": - (_, add_mask, mul_mask, mul_qk, reshape_qk, matmul_qk) = qk_nodes - - q_paths = { - "path1": (["Transpose", "Add"], [0, 0]), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - # print('q_nodes', q_nodes[0].name) - if q_path == "path1": - (q_tranpose, q_add) = q_nodes - - k_paths = { - "path1": (["Reshape", "Transpose", "Add"], [1, 0, 0]), - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - # print('k_nodes', k_nodes[0].name) - if k_path == "path1": - (_, k_transpose, k_add) = k_nodes - # print('add_mask', add_mask.name) - mask_paths = { - "path1": ( - ["Mul", "Sub", "Unsqueeze", "Cast", "Greater"], - [1, None, 1, 0, 0], - ) - } - mask_nodes, mask_path = self.match_parent_path_from_dict(add_mask, mask_paths) - - if mask_nodes is None: - logger.debug("fuse_attention: failed to match mask path") - return - # print('mask_nodes', mask_nodes[0].name) - (_, mask_sub, mask_unsqueeze, mask_cast, mask_greater) = mask_nodes - - if ( - self.get_shape(q_add.output[0]) == self.get_shape(k_add.output[0]) - and self.get_shape(k_add.output[0]) == self.get_shape(v_reshape.output[0]) - and mul_mask.input[1] in mask_unsqueeze.output - ): - attention_last_node = reshape_qkv_1 - - num_heads, hidden_size = self.get_num_heads_and_hidden_size( - fc_after_atten, mul_qk - ) - - q_transpose_type = None - q_transpose_name = None - for info in self.model.graph().value_info: - if info.name == q_tranpose.output[0]: - q_transpose_type = info.type - q_transpose_name = info.name - break - 
- q_transpose_output = helper.make_value_info( - q_transpose_name[:-2] + "_fake_q", q_transpose_type - ) - q_transpose_node = helper.make_node( - "Transpose", - inputs=[q_add.output[0]], - outputs=[q_transpose_output.name], - name=q_transpose_output.name, - ) - q_transpose_node.attribute.extend( - [helper.make_attribute("perm", [0, 2, 1, 3])] - ) - - k_transpose_output = helper.make_value_info( - q_transpose_name[:-2] + "_fake_k", q_transpose_type - ) - k_transpose_node = helper.make_node( - "Transpose", - inputs=[k_add.output[0]], - outputs=[k_transpose_output.name], - name=k_transpose_output.name, - ) - k_transpose_node.attribute.extend( - [helper.make_attribute("perm", [0, 2, 1, 3])] - ) - - v_transpose_output = helper.make_value_info( - q_transpose_name[:-2] + "_fake_v", q_transpose_type - ) - v_transpose_node = helper.make_node( - "Transpose", - inputs=[v_reshape.output[0]], - outputs=[v_transpose_output.name], - name=v_transpose_output.name, - ) - v_transpose_node.attribute.extend( - [helper.make_attribute("perm", [0, 2, 1, 3])] - ) - - mask_type = None - for info in self.model.graph().value_info: - if info.name == mask_sub.output[0]: - mask_type = info.type - break - - new_mask_type = onnx.TypeProto() - new_mask_type.tensor_type.elem_type = onnx.TensorProto.INT32 - for dim in mask_type.tensor_type.shape.dim: - new_dim = new_mask_type.tensor_type.shape.dim.add() - new_dim.CopyFrom(dim) - - mask_cast_to_int32_output = helper.make_value_info( - mask_sub.name + "_cast_to_int32", new_mask_type - ) - mask_cast_to_int32_node = helper.make_node( - "Cast", - inputs=[mask_sub.output[0]], - outputs=[mask_cast_to_int32_output.name], - name=mask_cast_to_int32_output.name, - ) - mask_cast_to_int32_node.attribute.extend([helper.make_attribute("to", 6)]) - - new_node = self.create_attention_node( - num_heads, - hidden_size, - q_transpose_node.output[0], - k_transpose_node.output[0], - v_transpose_node.output[0], - mask_cast_to_int32_node.output[0], - attention_last_node.output[0], - matmul_qk_add, - ) - if new_node is None: - return - - self.nodes_to_add.extend( - [ - q_transpose_node, - k_transpose_node, - v_transpose_node, - new_node, - mask_cast_to_int32_node, - ] - ) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.node_name_to_graph_name[q_transpose_node.name] = self.this_graph_name - self.node_name_to_graph_name[k_transpose_node.name] = self.this_graph_name - self.node_name_to_graph_name[v_transpose_node.name] = self.this_graph_name - self.node_name_to_graph_name[ - mask_cast_to_int32_node.name - ] = self.this_graph_name - - self.nodes_to_remove.extend(qkv_nodes[3:]) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes[:-1]) - self.nodes_to_remove.extend(k_nodes[:-1]) - self.nodes_to_remove.extend(v_nodes[:-1]) - self.nodes_to_remove.extend([mask_nodes[0]]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py deleted file mode 100644 index dfa14d0e25951f7ce72c719c452ebb56232e14a7..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionRoPE(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomRoPEPluginDynamic_IxRT", "Add") - - def fuse(self, start_node, input_name_to_nodes, output_name_to_node): - src_paths = {"path1": (["Mul", "Concat", "Split", "Slice"], [0, 1, None, 0])} - src_nodes, src_path = self.match_parent_path_from_dict(start_node, src_paths) - if src_nodes is None: - logger.debug("fuse_rope: failed to match src_node") - return - - src_node = src_nodes[0] - - rotate_paths = {"path1": (["Mul", "Reshape", "Concat"], [1, 0, 0])} - rotate_nodes, rotate_path = self.match_parent_path_from_dict( - start_node, rotate_paths - ) - - if rotate_nodes is None: - logger.debug("fuse_rope: failed to match rotate_path") - return - - concat_node = rotate_nodes[-1] - mul_right_node = rotate_nodes[0] - - odd_paths = {"path1": (["Unsqueeze", "Neg", "Slice", "Reshape"], [0, 0, 0, 0])} - odd_nodes, odd_path = self.match_parent_path_from_dict(concat_node, odd_paths) - - if odd_nodes is None: - logger.debug("fuse_rope: failed to match odd_path") - return - - even_paths = {"path1": (["Unsqueeze", "Slice", "Reshape"], [1, 0, 0])} - even_nodes, even_path = self.match_parent_path_from_dict( - concat_node, even_paths - ) - - if even_nodes is None: - logger.debug("fuse_rope: failed to match even_path") - return - reshape_node = even_nodes[-1] - - if reshape_node.output[0] == src_node.input[0]: - rope_node_name = self.model.create_node_name("RoPE") - rope_node = helper.make_node( - "CustomRoPEPluginDynamic_IxRT", - inputs=[ - reshape_node.output[0], - src_nodes[0].input[1], - mul_right_node.input[1], - ], - outputs=[start_node.output[0]], - name=rope_node_name, - ) - rope_node.domain = "com.iluvatar" - rope_node.attribute.extend([helper.make_attribute("type_id", 2)]) - rope_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - rope_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - - self.nodes_to_add.append(rope_node) - self.node_name_to_graph_name[rope_node.name] = self.this_graph_name - - self.nodes_to_remove.extend([start_node]) - self.nodes_to_remove.extend([src_nodes[0]]) - self.nodes_to_remove.extend(rotate_nodes) - self.nodes_to_remove.extend(odd_nodes[:-1]) - self.nodes_to_remove.extend(even_nodes[:-1]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py deleted file mode 100644 index 727d4b82d44805f6d52c8e7fd72d94acf846e73e..0000000000000000000000000000000000000000 --- 
a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Dict, List, Union - -from onnx import NodeProto, TensorProto - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionShape(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "Shape", "Concat") - self.utils = FusionUtils(model) - self.shape_infer = None - self.shape_infer_done = False - - def get_dimensions_from_tensor_proto( - self, tensor_proto: TensorProto - ) -> Union[int, None]: - if tensor_proto.type.tensor_type.HasField("shape"): - return len(tensor_proto.type.tensor_type.shape.dim) - else: - return None - - def get_dimensions(self, input_name: str) -> Union[int, None]: - graph_input = self.model.find_graph_input(input_name) - if graph_input: - return self.get_dimensions_from_tensor_proto(graph_input) - - if not self.shape_infer_done: - self.shape_infer = self.model.infer_runtime_shape({}, update=True) - self.shape_infer_done = True - - if self.shape_infer is not None: - return self.get_dimensions_from_tensor_proto( - self.shape_infer.known_vi_[input_name] - ) - - return None - - def fuse( - self, - concat_node: NodeProto, - input_name_to_nodes: Dict[str, List[NodeProto]], - output_name_to_node: Dict[str, NodeProto], - ): - """ - Smplify subgraph like - - (2d_input) - / \ - Shape shape - / \ - Gather(indices=0) Gather(indices=1) - | | - Unsqueeze(axes=0) Unsqueeze(axes=0) - \ / - Concat - | - - into (2d_input) --> Shape --> - """ - opset_version = self.model.get_opset_version() - - inputs = len(concat_node.input) - root = None - shape_output = None - for i in range(inputs): - path = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Gather", "Shape"], - [i, 0, 0], - output_name_to_node, - ) - if path is None: - return - - unsqueeze, gather, shape = path - if i == 0: - shape_output = shape.output[0] - if root is None: - root = shape.input[0] - if self.get_dimensions(root) != inputs: - return - elif shape.input[0] != root: - return - - if not FusionUtils.check_node_attribute( - unsqueeze, "axis", 0, default_value=0 - ): - return - - if opset_version < 13: - if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]): - return - else: - if not self.utils.check_node_input_value(unsqueeze, 1, [0]): - return - - value = self.model.get_constant_value(gather.input[1]) - from numpy import array_equal, ndarray - - if not ( - isinstance(value, ndarray) and value.size == 1 and 
value.item() == i - ): - return - - if self.model.find_graph_output(concat_node.output[0]) is None: - self.model.replace_input_of_all_nodes(concat_node.output[0], shape_output) - self.fused_count += 1 - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py deleted file mode 100644 index d0797b26dc6edfabd91f4bd9d07d0c1da383ef8b..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSkipLayerNormalization(Fusion): - """ - Fuse Add + LayerNormalization into one node: SkipLayerNormalization - Note: This fusion does not check the input shape of Add and LayerNormalization. - """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization" - ) - # Update shape inference is needed since other fusions might add new edge which does not have shape info yet. - self.shape_infer_helper = self.model.infer_runtime_shape( - {"batch_size": 4, "seq_len": 7}, update=True - ) - - if self.shape_infer_helper is None: - # TODO(tianleiwu): support subgraph in shape inference or add broadcasting in SkipLayerNormalization op. 
- logger.warning("symbolic shape inference disabled or failed.") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - add = self.model.get_parent(node, 0, output_name_to_node) - - # In some models there is input_ids->gather->add->LayerNorm and one of input of the - # add node is initializer with fixed shape which should not be fused into SkipLayerNorm - if add is None: - return - - for add_input in add.input: - if self.model.get_initializer(add_input) != None: - return - - # The number of input node of add should be 2 - if len(self.model.get_parents(add)) != 2: - return - - if self.shape_infer_helper is not None: - if not self.shape_infer_helper.compare_shape(add.input[0], add.input[1]): - logger.debug( - "skip SkipLayerNormalization fusion since shape of inputs (%s, %s) are not same", - add.input[0], - add.input[1], - ) - return - else: - layernorm_weight = self.model.get_initializer(node.input[1]) - if layernorm_weight is not None: - layernorm_weight_arr = NumpyHelper.to_array(layernorm_weight) - hidden_size = layernorm_weight_arr.shape[0] - else: - logger.debug( - "skip SkipLayerNormalization fusion since symbolic shape inference failed" - ) - return - - # gather_path = self.model.match_parent_path(add, ["Gather"], [None]) - # if gather_path is not None and self.model.find_graph_input(gather_path[0].input[1]) is None: - # if self.model.match_parent_path(gather_path[0], ["ConstantOfShape"], [1]) is None: - # return - - if ( - add is not None - and add.op_type == "Add" - and self.model.is_safe_to_fuse_nodes( - [add, node], node.output, input_name_to_nodes, output_name_to_node - ) - ): - self.nodes_to_remove.extend([add, node]) - - inputs = [add.input[0], add.input[1]] - normalize_node = helper.make_node( - "CustomSkipLayerNormPluginDynamic_IxRT", - inputs=inputs, - outputs=[node.output[0]], - name=self.model.create_node_name( - "SkipLayerNormalization", name_prefix="SkipLayerNorm" - ), - ) - normalize_node.domain = "com.iluvatar" - if self.shape_infer_helper is not None: - hidden_size = self.shape_infer_helper.get_edge_shape(node.input[1])[-1] - normalize_node.attribute.extend([helper.make_attribute("ld", hidden_size)]) - normalize_node.attribute.extend([helper.make_attribute("type_id", 2)]) - normalize_node.attribute.extend( - [ - helper.make_attribute( - "beta", self.model.get_initializer(node.input[2]) - ) - ] - ) - normalize_node.attribute.extend( - [ - helper.make_attribute( - "gamma", self.model.get_initializer(node.input[1]) - ) - ] - ) - normalize_node.attribute.extend( - [helper.make_attribute("plugin_namespace", "")] - ) - normalize_node.attribute.extend( - [helper.make_attribute("plugin_version", "1")] - ) - - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - - -class FusionBiasSkipLayerNormalization(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, - "CustomSkipLayerNormPluginDynamic_IxRT", - "SkipLayerNormalization", - "add bias", - ) - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if len(node.input) != 4: - return - - return_indice = [] - nodes = self.model.match_parent_path( - node, ["Add", "MatMul"], [None, None], None, return_indice - ) - if nodes is None: - return - assert len(return_indice) == 2 - add_input_index = return_indice[0] - if add_input_index >= 2: - return - - (add, matmul) = nodes - - # bias should be one dimension - bias_index = -1 - for i, input in enumerate(add.input): - initializer = self.model.get_initializer(input) - if 
initializer is None: - continue - bias_index = i - bias_weight = NumpyHelper.to_array(initializer) - break - if bias_weight is None: - logger.debug(f"Bias weight not found") - return - if len(bias_weight.shape) != 1: - logger.debug(f"Bias weight is not 1D") - return - - subgraph_nodes = [node, add] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node - ): - logger.debug( - f"Skip fusing SkipLayerNormalization with Bias since it is not safe" - ) - return - - self.nodes_to_remove.extend(subgraph_nodes) - inputs = [ - node.input[1 - add_input_index], - matmul.output[0], - node.input[2], - node.input[3], - add.input[bias_index], - ] - new_node = helper.make_node( - "CustomSkipLayerNormPluginDynamic_IxRT", - inputs=inputs, - outputs=node.output, - name=self.model.create_node_name( - "SkipLayerNormalization", "SkipLayerNorm_AddBias_" - ), - ) - new_node.domain = "com.iluvatar" - hidden_size = self.shape_infer_helper.get_edge_shape(node.input[2])[-1] - new_node.attribute.extend([helper.make_attribute("ld", hidden_size)]) - new_node.attribute.extend([helper.make_attribute("type_id", 2)]) - new_node.attribute.extend( - [helper.make_attribute("beta", self.model.get_initializer(node.input[3]))] - ) - new_node.attribute.extend( - [helper.make_attribute("gamma", self.model.get_initializer(node.input[2]))] - ) - new_node.attribute.extend( - [ - helper.make_attribute( - "bias", self.model.get_initializer(add.input[bias_index]) - ) - ] - ) - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py deleted file mode 100644 index 436257c3ce09b25790b132b6f918afebc63d9380..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSplitQKV(Fusion): - """ - Fuse FusionSplitQKV - """ - - def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): - super().__init__(model, "SplitQKV_IxRT", "MatMul") - - self.hidden_size = hidden_size - self.num_heads = num_heads - - def create_splitqkv_node( - self, input: str, query_out: str, key_out: str, value_out: str - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - node_name = self.model.create_node_name("SplitQKV_IxRT") - - new_node = helper.make_node( - "SplitQKV_IxRT", - inputs=[input], - outputs=[query_out, key_out, value_out], - name=node_name, - ) - new_node.domain = "com.iluvatar" - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - new_node.attribute.extend( - [helper.make_attribute("atten_scale", 1 / self.num_heads)] - ) - - return new_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - split_query_paths = { - "query_path": ( - ["Div", "Transpose", "Reshape", "Slice", "CustomFCPluginDynamic_IxRT"], - [0, 0, 0, 0, 0], - ), - } - - split_key_paths = { - "key_path": (["Transpose", "Reshape", "Slice"], [1, 0, 0]), - } - - q_nodes, q_path = self.match_parent_path_from_dict(node, split_query_paths) - - k_nodes, k_path = self.match_parent_path_from_dict(node, split_key_paths) - - if (q_nodes is not None) and (k_nodes is not None): - ( - q_div_node, - q_transpose_node, - q_reshape_node, - q_slice_node, - coustom_fc_node, - ) = q_nodes - k_transpose_node, k_reshape_node, k_slice_node = k_nodes - slice_nodes = self.model.get_children(coustom_fc_node) - - if len(slice_nodes) != 3: - return - slice_nodes.remove(q_slice_node) - slice_nodes.remove(k_slice_node) - v_slice_node = slice_nodes[0] - - node.input[0] = q_div_node.input[0] # dele div - new_node = self.create_splitqkv_node( - coustom_fc_node.output[0], - q_slice_node.output[0], - k_slice_node.output[0], - v_slice_node.output[0], - ) - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.nodes_to_remove.append(q_slice_node) - self.nodes_to_remove.append(k_slice_node) - self.nodes_to_remove.append(v_slice_node) - self.nodes_to_remove.append(q_div_node) - - else: - return diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV_update_KVcache.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV_update_KVcache.py deleted file mode 100644 index 4152eef6e6371dd4da27b5315bf5bd741d0749d1..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV_update_KVcache.py +++ /dev/null @@ -1,128 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSplitQKVUpdateKVCache(Fusion): - """ - Fuse FusionSplitQKVUpdateKVCache - """ - - def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): - super().__init__( - model, "SplitQKVUpdateKVCache_IxRT", "CustomQkvCrossToContext_IxRT" - ) - - self.hidden_size = hidden_size - self.num_heads = num_heads - - def create_node( - self, - inputs: list, - outputs: list, - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - node_name = self.model.create_node_name("SplitQKVUpdateKVCache_IxRT") - - new_node = helper.make_node( - "SplitQKVUpdateKVCache_IxRT", - inputs=inputs, - outputs=outputs, - name=node_name, - ) - new_node.domain = "com.iluvatar" - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - new_node.attribute.extend([helper.make_attribute("num_head", self.num_heads)]) - new_node.attribute.extend( - [helper.make_attribute("head_dim", self.hidden_size // self.num_heads)] - ) - - return new_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - query_paths = { - "query_path": ( - ["Transpose", "Reshape", "Split"], - [0, 0, None], - ), - } - - key_paths = { - "key_path": ( - ["Concat", "Transpose", "Reshape", "Split"], - [1, None, 0, None], - ), - } - - value_paths = { - "value_path": ( - ["Concat", "Transpose", "Reshape", "Split"], - [2, None, 0, None], - ), - } - - q_nodes, q_path = self.match_parent_path_from_dict(node, query_paths) - - k_nodes, k_path = self.match_parent_path_from_dict(node, key_paths) - - v_nodes, v_path = self.match_parent_path_from_dict(node, value_paths) - - if (q_nodes is not None) and (k_nodes is not None) and (v_nodes is not None): - (q_transpose_node, q_reshape_node, q_split_node) = q_nodes - (k_concat_node, k_transpose_node, k_reshape_node, k_split_node) = k_nodes - - (v_concat_node, v_transpose_node, v_reshape_node, v_split_node) = v_nodes - - inputs = [ - q_split_node.input[0], - k_concat_node.input[0], - v_concat_node.input[0], - ] - - outputs = [ - q_transpose_node.output[0], - k_concat_node.output[0], - v_concat_node.output[0], - ] - - new_node = self.create_node(inputs, outputs) - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.nodes_to_remove.append(q_transpose_node) - self.nodes_to_remove.append(q_reshape_node) - self.nodes_to_remove.append(q_split_node) - - self.nodes_to_remove.append(k_concat_node) - self.nodes_to_remove.append(k_transpose_node) - self.nodes_to_remove.append(k_reshape_node) - - self.nodes_to_remove.append(v_concat_node) - self.nodes_to_remove.append(v_transpose_node) - self.nodes_to_remove.append(v_reshape_node) - - else: - return \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py 
deleted file mode 100644 index e446a69a636ed38e6e869a15ba6196d727b6d855..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py +++ /dev/null @@ -1,413 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import List, Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -def get_tensor_attr(attrs, attr_name): - result = None - for i in attrs: - if i.name == attr_name: - return numpy_helper.to_array(i.t) - return result - - -class FusionSwinLAttention(Fusion): - """ - Fuse SwinL subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomFCPluginDynamic_IxRT"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_v: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. - - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - v_shape = self.model.get_initializer(reshape_v.input[1]) - if v_shape is None: - logger.debug(f"{reshape_v.input[1]} is not initializer.") - return self.num_heads, self.hidden_size # Fall back to user specified value - - v_shape_value = NumpyHelper.to_array(v_shape) - if len(v_shape_value) != 3 or (v_shape_value[1] <= 0 or v_shape_value[2] <= 0): - logger.debug( - f"v_shape_value={v_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return self.num_heads, self.hidden_size # Fall back to user specified value - - num_heads = 1 - for value_info in self.model.graph().value_info: - if value_info.name == reshape_v.input[0]: - num_heads = value_info.type.tensor_type.shape.dim[2].dim_value - break - hidden_size = v_shape_value[2] - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - inputs: List[str], - output: str, - ) -> Union[NodeProto, None]: - """Create an Attention node. 
- - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - self.fuse_pattern1(normalize_node, input_name_to_nodes, output_name_to_node) - self.fuse_pattern2(normalize_node, input_name_to_nodes, output_name_to_node) - - def fuse_pattern2(self, normalize_node, input_name_to_nodes, output_name_to_node): - """match Swin-L pattern and fuse them to CustomFC --> Attention --> CustomFC""" - logger.debug("fuse swin-L attention pass") - # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern - start_node = normalize_node - qkv_paths = { - "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]), - } - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - assert qkv_path == "path1", "abnormal qkv path" - reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes - - # 2. MatMul as start, go up to find v path - v_paths = { - "path1": ( - ["Transpose", "Reshape", "CustomFCPluginDynamic_IxRT"], - [None, 0, 0], - ) - } - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if not v_nodes: - logger.debug("fuse_attention: failed to match v path") - return - assert v_path == "path1", "abnormal v path" - - # 3. 
MatMul as start, go up to find q,k paths - # q path - q_paths = { - "path1": ( - [ - "Softmax", - "Add", - "Div", - "MatMul", - "Transpose", - "Reshape", - "CustomFCPluginDynamic_IxRT", - ], - [None, 0, 0, 0, 0, 0, 0], - ), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths) - if not q_nodes: - logger.debug("fuse_attention: failed to match q path") - return - assert q_path == "path1", "abnormal q paths found" - - # get Add(bias) input name as fused Attention inputs - add_op, div_op = q_nodes[1], q_nodes[2] - relative_position_bias_name = ( - add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0] - ) - - # k path - k_paths = { - "path2": ( - [ - "Softmax", - "Add", - "Div", - "MatMul", - "Transpose", - "Reshape", - "CustomFCPluginDynamic_IxRT", - ], - [None, 0, 0, 0, 1, 0, 0], - ) - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths) - if not k_nodes: - logger.debug("fuse_attention: failed to match k path") - return - assert k_path == "path2", "abnormal k paths found" - # 4. Fuse 3 CustomFC into one, and fuse attention - # Fuse FCs - fc_nodes = [q_nodes[-1], k_nodes[-1], v_nodes[-1]] - weight = self.fuse_tensor_in_node_attrs( - fc_nodes, "W", q_nodes[-1].name + "_Weight" - ) - bias = self.fuse_tensor_in_node_attrs(fc_nodes, "B", q_nodes[-1].name + "_Bias") - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[q_nodes[-1].input[0]], - outputs=q_nodes[-1].output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend( - [helper.make_attribute("out_dims", numpy_helper.to_array(bias).shape[0])] - ) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", weight)]) - fused_node.attribute.extend([helper.make_attribute("B", bias)]) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - - # Fuse Attention - num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv) - attention_node = self.create_attention_node( - num_heads, - hidden_size, - [fused_node.output[0], relative_position_bias_name], - reshape_qkv.output[0], - ) - if not attention_node: - return - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - self.nodes_to_remove.extend( - [*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes] - ) - self.prune_graph = True - - def fuse_pattern1(self, normalize_node, input_name_to_nodes, output_name_to_node): - """match Swin-L pattern and fuse them to CustomFC --> Attention --> CustomFC""" - logger.debug("fuse swin-L attention pass") - # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern - start_node = normalize_node - qkv_paths = { - "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]), - } - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - assert qkv_path == "path1", "abnormal qkv path" - reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes - - # 2. 
MatMul as start, go up to find v path - v_paths = { - "path1": ( - ["Transpose", "Reshape", "Add", "Split", "MatMul"], - [None, 0, 0, None, 0], - ) - } - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if not v_nodes: - logger.debug("fuse_attention: failed to match v path") - return - assert v_path == "path1", "abnormal v path" - - # 3. MatMul as start, go up to find q,k paths - # q path - q_paths = { - "path1": ( - [ - "Softmax", - "Add", - "Div", - "MatMul", - "Transpose", - "Reshape", - "Add", - "Split", - "MatMul", - ], - [None, 0, 0, 0, 0, 0, 0, None, 0], - ), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths) - if not q_nodes: - logger.debug("fuse_attention: failed to match q path") - return - assert q_path == "path1", "abnormal q paths found" - - # get Add(bias) input name as fused Attention inputs - add_op, div_op = q_nodes[1], q_nodes[2] - relative_position_bias_name = ( - add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0] - ) - - # k path - k_paths = { - "path2": ( - [ - "Softmax", - "Add", - "Div", - "MatMul", - "Transpose", - "Reshape", - "Add", - "Split", - "MatMul", - ], - [None, 0, 0, 0, 1, 0, 0, None, 0], - ) - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths) - if not k_nodes: - logger.debug("fuse_attention: failed to match k path") - return - assert k_path == "path2", "abnormal k paths found" - # 4. Attention and CustomFC have been found, now transform the found nodes to two plugin nodes - # Test 3 paths have the same origin - is_same_origin = q_nodes[-1] is k_nodes[-1] is v_nodes[-1] - is_same_origin &= q_nodes[-2] is k_nodes[-2] is v_nodes[-2] - is_same_origin &= q_nodes[-3] is not k_nodes[-2] is not v_nodes[-3] - if not is_same_origin: - print("swin-L fuse_attention: found qkv path but not has the same origin") - return - origin_matmul = q_nodes[-1] - fc_add = [q_nodes[-3], k_nodes[-3], v_nodes[-3]] - # Now fuse - num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv) - - # Fuse FC - weight = self.model.get_initializer(origin_matmul.input[1]) - biases = [self.model.get_initializer(i.input[0]) for i in fc_add] - if not weight or not all(biases): - print("swin-L: couldn't find weights") - return - weight_arr = onnx.numpy_helper.to_array(weight).transpose(1, 0) - weight.CopyFrom(numpy_helper.from_array(weight_arr)) - bias_arr = np.concatenate( - [onnx.numpy_helper.to_array(i) for i in biases], axis=0 - ) - - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[origin_matmul.input[0]], - outputs=fc_add[0].output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend( - [helper.make_attribute("out_dims", bias_arr.shape[0])] - ) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", weight)]) - fused_node.attribute.extend( - [helper.make_attribute("B", numpy_helper.from_array(bias_arr))] - ) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - # Fuse Attention - attention_node = self.create_attention_node( - num_heads, - hidden_size, - [fused_node.output[0], relative_position_bias_name], - 
reshape_qkv.output[0], - ) - if not attention_node: - return - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - self.nodes_to_remove.extend( - [*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes] - ) - self.prune_graph = True - - def fuse_tensor_in_node_attrs(self, fc_nodes, attr_name, tensor_name): - result = [get_tensor_attr(i.attribute, attr_name) for i in fc_nodes] - result = np.concatenate(result, axis=0) - result = numpy_helper.from_array(result, tensor_name) - return result diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py deleted file mode 100644 index bce0ab1713f20a19533e5793c4888607a7619c81..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py +++ /dev/null @@ -1,495 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionT5EncoderAttention(Fusion): - """ - Fuse T5Attention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "RMSNormPluginDynamic_IxRT"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. 
- - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return [0, 0] - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return [0, 0] - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input: str, - output: str, - matmul_qk_add: NodeProto, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - qk_bias = None - has_mask = 0 - has_qk_bias = 0 - add_input_is_value = False - if matmul_qk_add is not None: - has_qk_bias = 1 - qk_bias = self.model.get_initializer(matmul_qk_add.input[1]) - if qk_bias: - add_input_is_value = True - qk_bias_arr = NumpyHelper.to_array(qk_bias) - if len(qk_bias_arr.shape) == 3: - qk_bias_arr = qk_bias_arr.squeeze(0) - has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0) - if np.any(has_neg_inf): - qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype( - np.float32 - ) - qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name)) - - attention_inputs = [input] - - # 如果add的输入不是值,而是一个边,那么这个边的值需要cast到fp32 - cast_node = None - if not add_input_is_value: - cast_out_name = attention_node_name + "_fp32_in1" - cast_out_tensor = helper.make_tensor_value_info( - cast_out_name, TensorProto.FLOAT, [None, None, None, None] - ) - # self.model.add_initializer(cast_out_name) - cast_node = helper.make_node( - "Cast", - inputs=[matmul_qk_add.input[1]], - outputs=[cast_out_tensor.name], - name=self.model.create_node_name("Cast"), - to=1, - ) - self.node_name_to_graph_name[cast_node.name] = self.this_graph_name - attention_inputs.append(cast_out_name) - - if has_qk_bias: - if add_input_is_value: - has_mask = 1 - attention_inputs.append(qk_bias.name) - else: - has_mask = 1 - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - 
attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend( - [helper.make_attribute("has_qk_bias", has_qk_bias)] - ) - attention_node.attribute.extend([helper.make_attribute("is_t5_mode", 1)]) - - return attention_node, cast_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "RMSNormPluginDynamic_IxRT": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. - qkv_paths = { - "path1": (["MatMul", "Reshape", "Transpose", "MatMul"], [0, 0, 0, 0]), - "path2": (["MatMul", "Reshape", "Transpose", "MatMul"], [1, 0, 0, 0]), - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - - if qkv_path in ["path1", "path2"]: - (atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match T5 - Add/Gather --> LayerNormalization --> Attention --> Add --> LayerNormalization - | | - | | - +--------------------------------------------------- - """ - transpose_before_layernorm = self.model.match_parent(start_node, "Gather", 0) - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT": - root_input = child.output[0] - - add_before_layernorm = self.model.match_parent(start_node, "Add", None) - if add_before_layernorm is not None: - node_children = input_name_to_nodes[add_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT": - root_input = child.output[0] - - v_paths = { - "path1": ( - ["Transpose", "Reshape", "Split", "MatMul"], - [1, 0, 0, None], - ) # T5 - } - - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if v_path == "path1": - (_, _, _, matmul_in_qkv) = v_nodes - - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - - qk_paths = { - "path1": (["Softmax", "MatMul"], [0, 0]), - "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]), - } - - qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - - matmul_qk_add = None - if qk_path == "path1": - (_, matmul_qk) = qk_nodes - else: - (_, matmul_qk_add, matmul_qk) = qk_nodes - - q_paths = {"path1": (["Transpose", "Reshape", "Split"], [0, 0, 0])} - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - - if q_path == "path1": - (_, reshape_q, split_q) = q_nodes - # print(" 
split_q.name : ", split_q.name) - - k_paths = { - "path1": (["Transpose", "Reshape", "Split"], [1, 0, 0]), - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - - if k_path == "path1": - (_, _, split_k) = k_nodes - - if ( - matmul_in_qkv.input[0] == root_input - and split_q.input[0] == matmul_in_qkv.output[0] - and split_k.input[0] == matmul_in_qkv.output[0] - ): - attention_last_node = reshape_qkv - - num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q) - - new_node, new_cast_node = self.create_attention_node( - num_heads, - hidden_size, - matmul_in_qkv.output[0], - attention_last_node.output[0], - matmul_qk_add, - ) - if new_node is None: - return - - self.nodes_to_add.append(new_node) - if new_cast_node: - self.nodes_to_add.append(new_cast_node) - - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - self.nodes_to_remove.extend( - [attention_last_node, transpose_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes[:-2]) - - -class FusionT5DecoderAttention(Fusion): - """ - Fuse T5Attention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQkvCrossToContext_IxRT", - ["Softmax"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. - - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return [0, 0] - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return [0, 0] - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - return num_heads, hidden_size - - def create_decoder_attention_node( - self, inputs: str, outputs: str, type_mask: int, has_mask: int - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - - attention_node_name = self.model.create_node_name("decoder_Attention") - attention_node = helper.make_node( - "CustomQkvCrossToContext_IxRT", - inputs=inputs, - outputs=outputs, - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("scale", 1.0)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("type_mask", type_mask)]) - - return attention_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - """ - path1: - - (query) ---------------->MatMul --> add -->softmax --->MatMul---> - / / / - (key) ---->Transpose --> / / - / / - (mask) ------------------------> / - / - (value)---------------------------------------------> - - - - path2: - - (query) ---------------->MatMul ---------->softmax --->MatMul---> - / / - (key) ---->Transpose --> / - / - / - / - (value)---------------------------------------------> - - """ - - start_node = node - qkv_paths = { - "path1": ( - ["Add", "MatMul", "Transpose"], - [0, 0, 0], - ), # float mask self attention,self attention key pass - "path2": (["MatMul", "Transpose"], [0, 0]), # cross attention qery pass - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - next_nodes = self.model.get_children(node) - if len(next_nodes) == 0: - return - - if next_nodes[0].op_type != "MatMul": - return - - second_matmul_node = next_nodes[0] - attention_inputs = None - attention_outputs = second_matmul_node.output - remove_nodes = [second_matmul_node, node] - if qkv_path == "path1": - (add_node, first_matmul_node, transpose_node) = qkv_nodes - transpose_nodes = self.model.get_parents(first_matmul_node) - q_input = transpose_nodes[0].output[0] - k_input = transpose_nodes[1].input[0] - v_input = second_matmul_node.input[1] - attention_inputs = [q_input, k_input, v_input] - remove_nodes.extend([add_node, first_matmul_node, transpose_nodes[1]]) - - if qkv_path == "path2": - (first_matmul_node, transpose_node) = qkv_nodes - transpose_nodes = self.model.get_parents(first_matmul_node) - q_input = transpose_nodes[0].output[0] - k_input = transpose_nodes[1].input[0] - v_input = second_matmul_node.input[1] - attention_inputs = [q_input, k_input, v_input] - remove_nodes.extend([first_matmul_node, transpose_nodes[1]]) - - has_mask = 0 - type_mask = 4 # int32 mask - - if qkv_path == "path1": - mask_input = add_node.input[0] - score_out = first_matmul_node.output[0] - if add_node.input[0] == score_out: - mask_input = add_node.input[1] - attention_inputs.append(mask_input) - has_mask = 1 - type_mask = 3 # float mask - - atten_node = self.create_decoder_attention_node( - attention_inputs, attention_outputs, type_mask, has_mask - ) - self.nodes_to_add.append(atten_node) - self.node_name_to_graph_name[atten_node.name] = self.this_graph_name - self.nodes_to_remove.extend(remove_nodes) \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py deleted file mode 100644 
index 4765c8f51dbbf7b1f0da9e7821cc714665d1fbd8..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Tuple - -import numpy -from numpy import array_equal, ndarray -from onnx import NodeProto, TensorProto, helper, numpy_helper -from onnx import onnx_pb as onnx_proto - -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionUtils: - def __init__(self, model: OnnxModel): - self.model: OnnxModel = model - - def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]: - graph_input = self.model.find_graph_input(input_name) - if ( - graph_input is not None - and graph_input.type.tensor_type.elem_type != TensorProto.INT32 - ): - cast_output, cast_node = self.cast_input_to_int32(input_name) - logger.debug(f"Casted graph input {input_name} to int32") - return True, cast_output - - logger.debug( - f"Did not cast graph input {input_name} to int32: found {graph_input is not None}" - ) - return False, input_name - - def cast_input_to_int32(self, input_name: str): - cast_output = input_name + "_int32" - - # Avoid consequent Cast nodes. - inputs = [input_name] - output_name_to_node = self.model.output_name_to_node() - if input_name in output_name_to_node: - parent_node = output_name_to_node[input_name] - if parent_node and parent_node.op_type == "Cast": - inputs = [parent_node.input[0]] - - cast_node = helper.make_node("Cast", inputs=inputs, outputs=[cast_output]) - cast_node.attribute.extend( - [helper.make_attribute("to", int(TensorProto.INT32))] - ) - self.model.add_node(cast_node) - - return cast_output, cast_node - - def remove_cast_int32(self, input_name: str): - input_name_to_nodes = self.model.input_name_to_nodes() - nodes = input_name_to_nodes[input_name] - for node in nodes: - if node.op_type == "Cast": - is_int32 = False - for att in node.attribute: - if att.name == "to" and att.i == int(TensorProto.INT32): - is_int32 = True - break - if is_int32: - output_name = node.output[0] - self.model.remove_node(node) - self.model.replace_input_of_all_nodes(output_name, input_name) - - @staticmethod - def check_node_attribute( - node, attribute_name: str, expected_value, default_value=None - ): - """Verify that a node has expected value for an attribute. - - Args: - node (NodeProto): a node to check - attribute_name (str): name of attribute - expected_value (Any): expected value of the attribute - default_value (Any, optional): default value if the attribute does not exist. Defaults to None. 
- - Returns: - bool: whether the check is passed or not - """ - value = default_value - for attr in node.attribute: - if attr.name == attribute_name: - value = helper.get_attribute_value(attr) - - if isinstance(expected_value, list): - return ( - isinstance(value, ndarray) or isinstance(value, list) - ) and array_equal(expected_value, value, equal_nan=False) - else: - return value == expected_value - - @staticmethod - def transpose_2d_int8_tensor(tensor: onnx_proto.TensorProto): - """Transpose a 2-D INT8 TensorProto - Args: - tensor (TensorProto): tensor to be transposed - Returns: - tensor (TensorProto): transposed tensor - """ - if not isinstance(tensor, onnx_proto.TensorProto): - raise ValueError( - "Expected input type is an ONNX TensorProto but got %s" % type(tensor) - ) - - if len(tensor.dims) != 2 or tensor.data_type != onnx_proto.TensorProto.INT8: - raise ValueError("Only INT8 2-D tensors can be transposed") - - if tensor.raw_data: - int32_data = numpy.reshape( - numpy.frombuffer(tensor.raw_data, dtype="int8"), tensor.dims - ) - int32_transposed_data = numpy.transpose(int32_data, [1, 0]) - tensor.raw_data = int32_transposed_data.tobytes() - - else: - raise ValueError("only raw buffer supported") - - return tensor - - @staticmethod - def check_qdq_node_for_fusion( - node: NodeProto, model: OnnxModel, allow_per_tensor_quantization_only=True - ): - """Verify if a provided QuantizeLinear (Q) / DequantizeLinear (DQ) node is a good candidate for fusion. - It is a good candidate for fusion if: - (1) The Q/DQ node is for per-tensor quantization if allow_per_tensor_quantization_only is `True` - (2) The Q/DQ node should have constant scale - (3) The Q/DQ node should have a zero point of 0 - Args: - node (NodeProto): a Q/DQ node to check - Returns: - bool: whether the check is passed or not - """ - if not node.op_type in {"QuantizeLinear", "DequantizeLinear"}: - logger.debug(f"Provided node is not a Q/DQ node. 
Op Type: {node.op_type}") - - scale = model.get_constant_value(node.input[1]) - - # Scale is not constant - if scale is None: - return False - - # Not per-tensor quantization - scale_has_single_element = scale.ndim == 0 or ( - scale.ndim == 1 and scale.shape[0] == 1 - ) - if allow_per_tensor_quantization_only and not scale_has_single_element: - return False - - # If the Q/DQ node has no zero point input, it is assumed to be 0 (per ONNX spec) - if len(node.input) == 2: - return True - - # Zero point should be constant and should have a value of 0 - zero_point = model.get_constant_value(node.input[2]) - - # Zero point and scale should have same number of dims - if scale.ndim != zero_point.ndim: - return False - - # Zero point is not constant or zero point is not zero - if zero_point is None: - return False - - return numpy.all(zero_point == 0) - - def check_node_input_value(self, node, input_index: int, expected_value): - """Verify that a node has expected input value - - Args: - node (NodeProto): a node to check - input_index (int): index of its input to be verified - expected_value (Any): expected value of the input - - Returns: - bool: whether the check is passed or not - """ - assert len(node.input) > input_index - - value = self.model.get_constant_value(node.input[input_index]) - - if isinstance(expected_value, list): - return ( - isinstance(value, ndarray) or isinstance(value, list) - ) and array_equal(expected_value, value, equal_nan=False) - else: - return value == expected_value - - def remove_identity_nodes(self): - """Remove Identity nodes, except those right before graph output.""" - nodes_to_remove = [] - for node in self.model.nodes(): - if node.op_type == "Identity": - if node.output[0] not in self.model.get_graphs_output_names(): - self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) - nodes_to_remove.append(node) - - if nodes_to_remove: - self.model.remove_nodes(nodes_to_remove) - logger.info(f"Removed {len(nodes_to_remove)} Identity nodes") - - def remove_cascaded_cast_nodes(self): - self.model.remove_cascaded_cast_nodes() - - def remove_useless_cast_nodes(self): - self.model.remove_useless_cast_nodes() - - def remove_useless_reshape_nodes(self): - """Remove reshape node that is not needed based on symbolic shape inference: input and output has same shape""" - shape_infer = self.model.infer_runtime_shape(update=True) - if shape_infer is None: - return - - nodes_to_remove = [] - for node in self.model.nodes(): - if node.op_type == "Reshape": - input_shape = shape_infer.get_edge_shape(node.input[0]) - output_shape = shape_infer.get_edge_shape(node.output[0]) - if input_shape and output_shape and input_shape == output_shape: - logger.info( - f"Remove reshape node {node.name} since its input shape is same as output: {input_shape}" - ) - nodes_to_remove.append(node) - - if nodes_to_remove: - graph_input_names = set(self.model.get_graphs_input_names()) - graph_output_names = set(self.model.get_graphs_output_names()) - for node in nodes_to_remove: - if bool(set(node.output) & graph_output_names): - if not bool(set(node.input) & graph_input_names): - self.model.replace_output_of_all_nodes( - node.input[0], node.output[0] - ) - else: - continue - else: - self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) - self.model.remove_node(node) - - -class NumpyHelper: - @staticmethod - def to_array(tensor: TensorProto, fill_zeros: bool = False) -> ndarray: - # When weights are in external data format but not presented, we can still test the optimizer with two 
changes: - # (1) set fill_zeros = True (2) change load_external_data=False in optimizer.py - if fill_zeros: - from onnx import mapping - - return ndarray( - shape=tensor.dims, - dtype=mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.data_type], - ) - - return numpy_helper.to_array(tensor) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py deleted file mode 100644 index d3244b7a609da3d8bfda6f91ed606259093e59c4..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionVideoBertAttention(Fusion): - """ - Fuse VideoBertAttention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size( - self, atten_matmul: NodeProto, div: NodeProto - ) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. 
- - Args: - atten_matmul (NodeProto): MatMul node whose weight initializer gives the hidden size - div (NodeProto): Div node whose constant gives the head size - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1]) - div_initializer = self.model.get_initializer(div.input[1]) - - # Check whether float_data is empty - if len(div_initializer.float_data) > 0: - div_value = div_initializer.float_data[0] - else: - # If float_data is empty, try to read the data another way, - # e.g. when it is stored in raw_data - if len(div_initializer.raw_data) > 0: - dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type] - div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0] - else: - raise ValueError("Data not found in the div_initializer") - - atten_matul_shape_value = NumpyHelper.to_array(atten_matul_initializer).shape - head_dim = math.ceil(div_value * div_value) - hidden_size = atten_matul_shape_value[0] - num_heads = hidden_size // head_dim - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input: str, - output: str, - matmul_qk_add: NodeProto, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - qk_bias = None - has_mask = 0 - has_qk_bias = 0 - if matmul_qk_add is not None: - has_qk_bias = 1 - qk_bias = self.model.get_initializer(matmul_qk_add.input[1]) - qk_bias_arr = NumpyHelper.to_array(qk_bias) - if len(qk_bias_arr.shape) == 3: - qk_bias_arr = qk_bias_arr.squeeze(0) - has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0) - if np.any(has_neg_inf): - qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype( - np.float32 - ) - qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name)) - - attention_inputs = [input] - - if qk_bias is not None: - has_mask = 1 - attention_inputs.append(qk_bias.name) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend( - [helper.make_attribute("has_qk_bias", has_qk_bias)] - ) - - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before
layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. - qkv_paths = { - "path1": ( - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [0, None, 0, 0, 0], - ), - "path2": ( - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [1, None, 0, 0, 0], - ), - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - - if qkv_path in ["path1", "path2"]: - (_, atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match videobert - transpose/Add --> LayerNormalization --> Attention --> Add --> LayerNormalization - | | - | | - +--------------------------------------------------------- - """ - transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0) - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - add_before_layernorm = self.model.match_parent(start_node, "Add", None) - if add_before_layernorm is not None: - node_children = input_name_to_nodes[add_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - v_paths = { - "path1": ( - ["Transpose", "Reshape", "Slice", "Add", "MatMul"], - [1, 0, 0, 0, None], - ) # videobert - } - - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if v_path == "path1": - (_, _, _, add_in_qkv, matmul_in_qkv) = v_nodes - - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - - qk_paths = { - "path1": (["Softmax", "MatMul"], [0, 0]), - "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]), - } - - qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - - matmul_qk_add = None - if qk_path == "path1": - (_, matmul_qk) = qk_nodes - else: - (_, matmul_qk_add, matmul_qk) = qk_nodes - - q_paths = { - "path1": (["Transpose", "Reshape", "Slice"], [0, 0, 0]), - "path2": (["Div", "Transpose", "Reshape", "Slice"], [0, 0, 0, 0]), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - - if q_path == "path1": - (_, _, slice_q) = q_nodes - else: - (div, _, _, slice_q) = q_nodes - - k_paths = { - "path1": (["Transpose", "Reshape", "Slice"], [1, 0, 0]), - "path2": (["Div", "Transpose", "Reshape", "Slice"], [1, 0, 0, 0]), - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - - if k_path == "path1": - (_, _, slice_k) = k_nodes - else: - 
(div, _, _, slice_k) = k_nodes - - if ( - matmul_in_qkv.input[0] == root_input - and slice_q.input[0] == add_in_qkv.output[0] - and slice_k.input[0] == add_in_qkv.output[0] - ): - attention_last_node = reshape_qkv - - num_heads, hidden_size = self.get_num_heads_and_hidden_size( - atten_matmul, div - ) - - new_node = self.create_attention_node( - num_heads, - hidden_size, - add_in_qkv.output[0], - attention_last_node.output[0], - matmul_qk_add, - ) - if new_node is None: - return - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - self.nodes_to_remove.extend( - [attention_last_node, transpose_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes[:-2]) - - # fuse head and tail transpose - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[ - transpose_before_layernorm.output[0] - ] - for child in node_children: - for i, input in enumerate(child.input): - if child.input[i] == transpose_before_layernorm.output[0]: - child.input[i] = transpose_before_layernorm.input[0] - self.nodes_to_remove.extend([transpose_before_layernorm]) - - node = transpose_before_layernorm - while True: - found = False - node_children = input_name_to_nodes[node.output[0]] - for child in node_children: - if child is not None and child.op_type in [ - "SkipLayerNorm", - "Add", - ]: - node = child - found = True - break - if not found: - break - node_children = input_name_to_nodes[node.output[0]] - if len(node_children) == 1 and node_children[0].op_type == "Transpose": - transpose_node = node_children[0] - transpose_children = input_name_to_nodes[transpose_node.output[0]] - for i, input in enumerate(transpose_children[0].input): - if transpose_children[0].input[i] == transpose_node.output[0]: - transpose_children[0].input[i] = transpose_node.input[0] - self.nodes_to_remove.extend([transpose_node]) - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - # self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py deleted file mode 100644 index f1a5410b62283e45f4f0a8957eaf7e83be6a6124..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py +++ /dev/null @@ -1,469 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -import math -from typing import Dict -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionVITAttention(Fusion): - """ - Fuse VITAttention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size( - self, custom_fc: NodeProto, mul: NodeProto - ) -> Tuple[int, int]: - mul_initializer = self.model.get_initializer(mul.input[1]) - - # Check whether float_data is empty - if len(mul_initializer.float_data) > 0: - mul_value = mul_initializer.float_data[0] - else: - # If float_data is empty, try to read the data another way, - # e.g. when it is stored in raw_data - if len(mul_initializer.raw_data) > 0: - dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type] - mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0] - else: - raise ValueError("Data not found in the mul_initializer") - - for attr in custom_fc.attribute: - if attr.name == "W": - tensor_value = attr.t - tensor_shape = [dim for dim in tensor_value.dims] - break - head_dim = math.floor(1.0 / (mul_value * mul_value)) * math.floor( - 1.0 / (mul_value * mul_value) - ) - hidden_size = tensor_shape[0] - num_heads = hidden_size // head_dim - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input: str, - output: str, - matmul_qk_add: NodeProto, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed.
- """ - assert num_heads > 0 - # print(hidden_size, num_heads) - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - qk_bias = None - has_mask = 0 - has_qk_bias = 0 - if matmul_qk_add is not None: - has_qk_bias = 1 - qk_bias = self.model.get_initializer(matmul_qk_add.input[1]) - qk_bias_arr = NumpyHelper.to_array(qk_bias) - if len(qk_bias_arr.shape) == 3: - qk_bias_arr = qk_bias_arr.squeeze(0) - has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0) - if np.any(has_neg_inf): - qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype( - np.float32 - ) - qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name)) - - attention_inputs = [input] - - if qk_bias is not None: - has_mask = 1 - attention_inputs.append(qk_bias.name) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend( - [helper.make_attribute("has_qk_bias", has_qk_bias)] - ) - - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
- qkv_paths = { - "path1": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [0, 0, 0]), - "path2": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [1, 0, 0]), - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - - if qkv_path in ["path1", "path2"]: - (custom_fc_after_atten, transpose_qkv, matmul_qkv) = qkv_nodes - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match VIT - transpose --> LayerNormalization --> custom_fc -> attention -> Add - | | - | | - +------------------------------------------------------------------- - """ - transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0) - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - add_before_layernorm = self.model.match_parent(start_node, "Add", None) - if add_before_layernorm is not None: - node_children = input_name_to_nodes[add_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - # print("root_input: ", root_input, matmul_qkv.name) - v_paths = { - "path1": ( - [ - "Reshape", - "Transpose", - "Reshape", - "Gather", - "Squeeze", - "Transpose", - "Unsqueeze", - "Reshape", - "CustomFCPluginDynamic_IxRT", - ], - [1, 0, 0, 0, 0, 0, 0, 0, 0], - ) # vit - } - - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - - squeeze_input = custom_fc = None - if v_path == "path1": - (_, _, _, _, squeeze_input, _, _, _, custom_fc) = v_nodes - - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - - qk_paths = { - "path1": (["Softmax", "MatMul"], [0, 0]), - "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]), - } - - qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) - # print("qk_nodes:", qk_nodes[1].name) - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - - matmul_qk_add = None - if qk_path == "path1": - (_, matmul_qk) = qk_nodes - else: - (_, matmul_qk_add, matmul_qk) = qk_nodes - - q_paths = { - "path1": ( - ["Mul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze"], - [0, 0, 0, 0, 0, 0], - ), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) - # print("q_nodes:", q_nodes[0].name) - squeeze_q = mul_q = None - if q_path == "path1": - squeeze_q = q_nodes[-1] - mul_q = q_nodes[0] - - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - - k_paths = { - "path1": ( - [ - "Mul", - "Transpose", - "Reshape", - "Transpose", - "Reshape", - "Gather", - "Squeeze", - ], - [1, 0, 0, 0, 0, 0, 0], - ), - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) - # print("k_nodes:", k_nodes[0].name) - squeeze_k = None - if k_path == "path1": - squeeze_k = k_nodes[-1] - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - - if ( - custom_fc.input[0] == root_input - and squeeze_input == squeeze_q - and squeeze_input == squeeze_k - ): - 
attention_last_node = transpose_qkv - - num_heads, hidden_size = self.get_num_heads_and_hidden_size( - custom_fc_after_atten, mul_q - ) - - new_node = self.create_attention_node( - num_heads, - hidden_size, - custom_fc.output[0], - attention_last_node.output[0], - matmul_qk_add, - ) - if new_node is None: - return - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - self.nodes_to_remove.extend([transpose_qkv, matmul_qkv]) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes[:-1]) - self.nodes_to_remove.extend(k_nodes[:-1]) - self.nodes_to_remove.extend(v_nodes[:-1]) - - # fuse head and tail transpose - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[ - transpose_before_layernorm.output[0] - ] - for child in node_children: - for i, input in enumerate(child.input): - if child.input[i] == transpose_before_layernorm.output[0]: - child.input[i] = transpose_before_layernorm.input[0] - self.nodes_to_remove.extend([transpose_before_layernorm]) - - node = transpose_before_layernorm - while True: - found = False - node_children = input_name_to_nodes[node.output[0]] - for child in node_children: - if child is not None and child.op_type in [ - "SkipLayerNorm", - "Add", - ]: - node = child - found = True - break - if not found: - break - node_children = input_name_to_nodes[node.output[0]] - if len(node_children) == 1 and node_children[0].op_type == "Transpose": - transpose_node = node_children[0] - transpose_children = input_name_to_nodes[transpose_node.output[0]] - for i, input in enumerate(transpose_children[0].input): - if transpose_children[0].input[i] == transpose_node.output[0]: - transpose_children[0].input[i] = transpose_node.input[0] - self.nodes_to_remove.extend([transpose_node]) - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - # self.prune_graph = True - - -class FusionTorchvisionVITAttention(Fusion): - """ - Fuse VITAttention subgraph into one Attention node. 
- """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomQKVToContextPluginDynamic_IxRT", "CustomFCPluginDynamic_IxRT" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - [Root] --> CustomFCPluginDynamic_IxRT--> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT - """ - children = self.model.get_children(node, input_name_to_nodes) - parent = self.model.get_parents(node, output_name_to_node) - - if len(children) != 1: - return - if len(parent) != 1: - return - - fc_first_node = None - for par in parent: - fc_first_node = self.model.find_first_parent_by_type( - par, "CustomFCPluginDynamic_IxRT", output_name_to_node, recursive=True - ) - if fc_first_node is not None: - break - if fc_first_node is None: - return - - start_node = node - - # v path - v_nodes = self.model.match_parent_path( - start_node, - ["Transpose", "MatMul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze", "Transpose", "Unsqueeze", "Reshape"], - [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - # path1, q and k path - q_nodes = self.model.match_parent_path( - start_node, - ["Transpose", "MatMul", "Softmax", "MatMul", "Mul", "Transpose", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze", "Transpose", "Unsqueeze", "Reshape"], - [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - k_nodes = self.model.match_parent_path( - start_node, - ["Transpose", "MatMul", "Softmax", "MatMul", "Mul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze", "Transpose", "Unsqueeze", "Reshape"], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - if v_nodes is None: - return - - if v_nodes and q_nodes and k_nodes: - subgraph_nodes = [] - subgraph_nodes.extend(q_nodes) - subgraph_nodes.extend(k_nodes) - subgraph_nodes.extend(v_nodes) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - - hidden_size = start_node.attribute[0].i - _, mul_val = self.model.get_constant_input(k_nodes[4]) - num_heads = hidden_size // (math.floor(1.0 / (mul_val * mul_val)) * math.floor(1.0 / (mul_val * mul_val))) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=[fc_first_node.output[0]], - outputs=[start_node.input[0]], - name=self.model.create_node_name( - "TorchvisionVitAttention", name_prefix="TorchvisionVitAttention" - ), - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend([helper.make_attribute("hidden_size", hidden_size)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", 0)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 0)]) - - self.nodes_to_remove.extend(subgraph_nodes_unique) - - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py deleted file mode 100644 index 
df55ba645988ddbffcd157e38db2c73ff34789a2..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionXSoftmax(Fusion): - """ - Fuse Where + Softmax + Where into one node: XSoftmax - """ - - def __init__(self, model: OnnxModel): - super().__init__(model, "XSoftmax_IxRT", "MatMul") - - def create_xsoftmax_node( - self, data_input: str, mask_input: str, output: str - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): mask input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed.
- """ - xsoftmax_node_name = self.model.create_node_name("XSoftmax") - - xsoftmax_node = helper.make_node( - "XSoftmax_IxRT", - inputs=[data_input, mask_input], - outputs=[output], - name=xsoftmax_node_name, - ) - xsoftmax_node.domain = "com.iluvatar" - xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)]) - xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)]) - - return xsoftmax_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - xsoftmax_paths = { - "path": (["Where", "Softmax", "Where", "Add"], [None, None, None, None]), - } - xsoftmax_nodes, xsoftmax_path = self.match_parent_path_from_dict( - node, xsoftmax_paths - ) - - if xsoftmax_nodes is None: - logger.debug("fuse_xsoftmax: failed to match xsoftmax path") - return - else: - (tail_where, softmax, head_where, add) = xsoftmax_nodes - where_inputs = [i for i in tail_where.input if i in head_where.input] - assert len(where_inputs) == 1 - mask_input = where_inputs[0] - data_input = add.output[0] - data_output = tail_where.output[0] - - xsoftmax_node = self.create_xsoftmax_node( - data_input, mask_input, data_output - ) - - self.nodes_to_add.append(xsoftmax_node) - self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name - self.nodes_to_remove.append(tail_where) - self.nodes_to_remove.append(softmax) - self.nodes_to_remove.append(head_where) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py deleted file mode 100644 index f2d07ce96d60c5e8fbfc749d1049bad471525239..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import List, Tuple, Union - -import numpy as np -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -def get_tensor_attr(attrs, attr_name): - result = None - for i in attrs: - if i.name == attr_name: - return numpy_helper.to_array(i.t) - return result - - -class FusionYoloV5Decoder(Fusion): - """ - Fuse the YoloV5 decoder subgraph into one YoloV5Decoder node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__(model, "YoloV5Decoder", ["Reshape"]) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - short_path = ["Concat", "Slice", "Sigmoid", "Transpose", "Reshape"] - paths = [ - (["Concat", "Unsqueeze", "Gather", "Shape"], [1] + [None] * 3), - ( - ["Concat", "Mul", "Add", "Sub", "Mul", "Slice", "Sigmoid", "Transpose"], - [0, 0] + [None] * 6, - ), - ( - ["Concat", "Mul", "Pow", "Mul", "Slice", "Sigmoid", "Transpose"], - [0, 1] + [None] * 5, - ), - (short_path, [None] * 5), - (short_path + ["Concat", "Unsqueeze", "Gather", "Shape"], [None] * 9), - ] - paths_found = [] - nodes_names_found = set() - nodes_found = [] - for path_i in paths: - nodes = self.model.match_parent_path(normalize_node, path_i[0], path_i[1]) - paths_found.append(nodes) - if nodes: - for n in nodes: - if n.name not in nodes_names_found: - nodes_names_found.add(n.name) - nodes_found.append(n) - if not all(paths_found): - return - shape_node = paths_found[-1][-1] - params = self._find_yolov5_decoder_params(paths_found) - self._fuse_node( - inputs=shape_node.input, outputs=normalize_node.output, params=params - ) - self.nodes_to_remove.extend(nodes_found) - self._delete_extra_output_edges(paths_found) - self.prune_graph = True - - def _fuse_node(self, inputs, outputs, params): - fused_node = helper.make_node( - "YoloV5Decoder", - inputs=inputs, - outputs=outputs, - name=self.model.create_node_name("YoloV5Decoder"), - ) - fused_node.attribute.extend(params) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - - def _delete_extra_output_edges(self, paths_found): - transpose_node = paths_found[2][-1] - assert transpose_node.op_type == "Transpose" - out_edge = transpose_node.output[0] - for item in self.model.graph().output: - if item.name == out_edge: - self.model.graph().output.remove(item) - logger.warning(f"Output: {out_edge} is useless in graph, delete it") - return - - def _find_yolov5_decoder_params(self, paths_found): - # num_class - concat_op = paths_found[0][0] - assert concat_op.op_type == "Concat" - num_class_arr = self.model.get_initializer(concat_op.input[2], True) - assert num_class_arr - num_class = (num_class_arr - 5).tolist()[0] - num_class = helper.make_attribute("num_class", num_class) - - # stride - mul_op = paths_found[1][1] - assert mul_op.op_type == "Mul" - input_arrs = self.model.get_initializer_input_edges(mul_op.name, True) - assert len(input_arrs) == 1 - stride = input_arrs[0].tolist() - stride = helper.make_attribute("stride", stride) - - # anchor - mul_op = paths_found[2][1] - assert mul_op.op_type == "Mul" - anchor =
self.model.get_initializer_input_edges(mul_op.name, True) - assert len(anchor) == 1 - anchor = anchor[0] - anchor = anchor[0, :, 0, 0, :] if len(anchor.shape) == 5 else anchor[:, 0, 0, :] - anchor = helper.make_attribute("anchor", list(anchor.flatten())) - - # fast_impl - fast_impl = helper.make_attribute("faster_impl", 1) - - return [num_class, stride, anchor, fast_impl] diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py deleted file mode 100644 index 0b76f660fce62ec0aa19b8c132a6ba51cf6fe319..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py +++ /dev/null @@ -1,1182 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -import logging -import os -import sys -from collections import deque -from pathlib import Path -from typing import Dict, List, Optional, Tuple - -from onnx import ( - AttributeProto, - GraphProto, - ModelProto, - NodeProto, - TensorProto, - helper, - numpy_helper, - save_model, -) - -from .float16 import convert_float_to_float16 -from .shape_infer_helper import SymbolicShapeInferenceHelper - -logger = logging.getLogger(__name__) - - -class OnnxModel: - def __init__(self, model): - self.initialize(model) - self.initializer_visited: Dict[str, bool] = {} - - def initialize(self, model): - self.model: ModelProto = model - self._node_name_suffix: Dict[ - str, int - ] = {} # key is node name prefix, value is the last suffix generated - self.shape_infer_helper: SymbolicShapeInferenceHelper = None - self.enable_shape_infer: bool = True - self.all_graphs: Optional[List[GraphProto]] = None - - def disable_shape_inference(self): - self.enable_shape_infer = False - - def infer_runtime_shape(self, dynamic_axis_mapping={}, update=False): - if self.enable_shape_infer: - if self.shape_infer_helper is None or update: - self.shape_infer_helper = SymbolicShapeInferenceHelper(self.model) - - try: - if self.shape_infer_helper.infer(dynamic_axis_mapping): - return self.shape_infer_helper - except: - self.enable_shape_infer = ( - False # disable shape inference to suppress same error message. 
- ) - print("failed in shape inference", sys.exc_info()[0]) - - return None - - def input_name_to_nodes(self): - input_name_to_nodes = {} - for node in self.nodes(): - for input_name in node.input: - if input_name not in input_name_to_nodes: - input_name_to_nodes[input_name] = [node] - else: - input_name_to_nodes[input_name].append(node) - return input_name_to_nodes - - def output_name_to_node(self): - output_name_to_node = {} - for node in self.nodes(): - for output_name in node.output: - output_name_to_node[output_name] = node - return output_name_to_node - - def nodes(self): - all_nodes = [] - for graph in self.graphs(): - for node in graph.node: - all_nodes.append(node) - return all_nodes - - def graph(self): - return self.model.graph - - def graphs(self): - if self.all_graphs is not None: - return self.all_graphs - self.all_graphs = [] - graph_queue = [self.model.graph] - while graph_queue: - graph = graph_queue.pop(0) - self.all_graphs.append(graph) - for node in graph.node: - for attr in node.attribute: - if attr.type == AttributeProto.AttributeType.GRAPH: - assert isinstance(attr.g, GraphProto) - graph_queue.append(attr.g) - if attr.type == AttributeProto.AttributeType.GRAPHS: - for g in attr.graphs: - assert isinstance(g, GraphProto) - graph_queue.append(g) - return self.all_graphs - - def get_graphs_input_names(self): - input_names = [] - for graph in self.graphs(): - for input in graph.input: - input_names.append(input.name) - return input_names - - def get_graphs_output_names(self): - output_names = [] - for graph in self.graphs(): - for output in graph.output: - output_names.append(output.name) - return output_names - - def get_graph_by_node(self, node): - for graph in self.graphs(): - if node in graph.node: - return graph - return None - - def get_graph_by_name(self, graph_name): - for graph in self.graphs(): - if graph_name == graph.name: - return graph - return None - - def get_topological_insert_id(self, graph, outputs): - for idx, node in enumerate(graph.node): - for input in node.input: - if input in outputs: - return idx - return len(graph.node) - - def remove_node(self, node): - for graph in self.graphs(): - if node in graph.node: - graph.node.remove(node) - - def remove_nodes(self, nodes_to_remove): - for node in nodes_to_remove: - self.remove_node(node) - - def add_node(self, node, graph_name=None): - if graph_name is None or graph_name == self.model.graph.name: - self.model.graph.node.extend([node]) - else: - graph = self.get_graph_by_name(graph_name) - insert_idx = self.get_topological_insert_id(graph, node.output) - graph.node.insert(insert_idx, node) - - def add_nodes(self, nodes_to_add, node_name_to_graph_name=None): - if node_name_to_graph_name is None: - self.model.graph.node.extend(nodes_to_add) - else: - for node in nodes_to_add: - graph_name = node_name_to_graph_name[node.name] - self.add_node(node, graph_name) - - def add_initializer(self, tensor, graph_name=None): - if graph_name is None or graph_name == self.model.graph.name: - self.model.graph.initializer.extend([tensor]) - else: - graph = self.get_graph_by_name(graph_name) - graph.initializer.extend([tensor]) - - def add_input(self, input, graph_name=None): - if graph_name is None or graph_name == self.model.graph.name: - self.model.graph.input.extend([input]) - else: - graph = self.get_graph_by_name(graph_name) - graph.input.extend([input]) - - @staticmethod - def replace_node_input(node, old_input_name, new_input_name): - assert isinstance(old_input_name, str) and isinstance(new_input_name, str) - 
for j in range(len(node.input)): - if node.input[j] == old_input_name: - node.input[j] = new_input_name - - def replace_input_of_all_nodes(self, old_input_name, new_input_name): - for node in self.model.graph.node: - OnnxModel.replace_node_input(node, old_input_name, new_input_name) - - @staticmethod - def replace_node_output(node, old_output_name, new_output_name): - assert isinstance(old_output_name, str) and isinstance(new_output_name, str) - for j in range(len(node.output)): - if node.output[j] == old_output_name: - node.output[j] = new_output_name - - def replace_output_of_all_nodes(self, old_output_name, new_output_name): - for node in self.model.graph.node: - OnnxModel.replace_node_output(node, old_output_name, new_output_name) - - def get_initializer(self, name, return_np_array=False): - for graph in self.graphs(): - for tensor in graph.initializer: - if tensor.name == name: - return numpy_helper.to_array(tensor) if return_np_array else tensor - return None - - def get_node(self, op_name): - for graph in self.graphs(): - for n in graph.node: - if n.name == op_name: - return n - return None - - def get_initializer_input_edges(self, op_name, return_np_array=False): - initializers = {i.name: i for graph in self.graphs() for i in graph.initializer} - node = self.get_node(op_name) - assert node - result = [] - for i in node.input: - if i in initializers: - tensor = initializers[i] - tensor = numpy_helper.to_array(tensor) if return_np_array else tensor - result.append(tensor) - return result - - def get_nodes_by_op_type(self, op_type): - nodes = [] - for node in self.nodes(): - if node.op_type == op_type: - nodes.append(node) - return nodes - - def get_children(self, node, input_name_to_nodes=None): - if input_name_to_nodes is None: - input_name_to_nodes = self.input_name_to_nodes() - - children = [] - for output in node.output: - if output in input_name_to_nodes: - for node in input_name_to_nodes[output]: - children.append(node) - return children - - def get_parents(self, node, output_name_to_node=None): - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - parents = [] - for input in node.input: - if input in output_name_to_node: - parents.append(output_name_to_node[input]) - return parents - - def get_parent(self, node, i, output_name_to_node=None): - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - if len(node.input) <= i: - return None - - input = node.input[i] - if input not in output_name_to_node: - return None - - return output_name_to_node[input] - - def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=[]): - """ - Find parent node based on constraints on op_type. - - Args: - node (str): current node name. - parent_op_type (str): constraint of parent node op_type. - output_name_to_node (dict): dictionary with output name as key, and node as value. - exclude (list): list of nodes that are excluded (not allowed to match as parent). - - Returns: - parent: The matched parent node. None if not found. - index: The input index of matched parent node. None if not found. 
- """ - for i, input in enumerate(node.input): - if input in output_name_to_node: - parent = output_name_to_node[input] - if parent.op_type == parent_op_type and parent not in exclude: - return parent, i - else: - logger.debug( - f"To find first {parent_op_type}, current {parent.op_type}" - ) - return None, None - - def match_parent( - self, - node, - parent_op_type, - input_index=None, - output_name_to_node=None, - exclude=[], - return_indice=None, - ): - """ - Find parent node based on constraints on op_type and index. - When input_index is None, we will find the first parent node based on constraints, and return_indice will be appended the corresponding input index. - - Args: - node (str): current node name. - parent_op_type (str): constraint of parent node op_type. - input_index (int or None): only check the parent given input index of current node. - output_name_to_node (dict): dictionary with output name as key, and node as value. - exclude (list): list of nodes that are excluded (not allowed to match as parent). - return_indice (list): a list to append the input index when input_index is None. - - Returns: - parent: The matched parent node. - """ - assert node is not None - assert input_index is None or input_index >= 0 - - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - if input_index is None: - parent, index = self.match_first_parent( - node, parent_op_type, output_name_to_node, exclude - ) - if return_indice is not None: - return_indice.append(index) - return parent - - if input_index >= len(node.input): - logger.debug(f"input_index {input_index} >= node inputs {len(node.input)}") - return None - - parent = self.get_parent(node, input_index, output_name_to_node) - if ( - parent is not None - and parent.op_type == parent_op_type - and parent not in exclude - ): - return parent - - if parent is not None: - logger.debug(f"Expect {parent_op_type}, Got {parent.op_type}") - - return None - - def match_parent_paths(self, node, paths, output_name_to_node): - for i, path in enumerate(paths): - assert isinstance(path, List) or isinstance(path, Tuple) - return_indice = [] - matched = self.match_parent_path( - node, path[0], path[1], output_name_to_node, return_indice - ) - if matched: - return i, matched, return_indice - return -1, None, None - - def match_parent_path( - self, - node, - parent_op_types, - parent_input_index, - output_name_to_node=None, - return_indice=None, - ): - """ - Find a sequence of input edges based on constraints on parent op_type and index. - When input_index is None, we will find the first parent node based on constraints, and return_indice will be appended the corresponding input index. - - Args: - node (str): current node name. - parent_op_types (str): constraint of parent node op_type of each input edge. - parent_input_index (list): constraint of input index of each input edge. None means no constraint. - output_name_to_node (dict): dictionary with output name as key, and node as value. - return_indice (list): a list to append the input index when there is no constraint on input index of an edge. - - Returns: - parents: a list of matched parent node. 
- """ - assert len(parent_input_index) == len(parent_op_types) - - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - current_node = node - matched_parents = [] - for i, op_type in enumerate(parent_op_types): - matched_parent = self.match_parent( - current_node, - op_type, - parent_input_index[i], - output_name_to_node, - exclude=[], - return_indice=return_indice, - ) - if matched_parent is None: - logger.debug( - f"Failed to match index={i} parent_input_index={parent_input_index[i]} op_type={op_type}", - stack_info=True, - ) - return None - - matched_parents.append(matched_parent) - current_node = matched_parent - - return matched_parents - - def find_first_child_by_type( - self, node, child_type, input_name_to_nodes=None, recursive=True - ): - children = self.get_children(node, input_name_to_nodes) - dq = deque(children) - while len(dq) > 0: - current_node = dq.pop() - if current_node.op_type == child_type: - return current_node - - if recursive: - children = self.get_children(current_node, input_name_to_nodes) - for child in children: - dq.appendleft(child) - - return None - - def find_first_parent_by_type( - self, node, parent_type, output_name_to_node=None, recursive=True - ): - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - parents = self.get_parents(node, output_name_to_node) - dq = deque(parents) - while len(dq) > 0: - current_node = dq.pop() - if current_node.op_type == parent_type: - return current_node - - if recursive: - parents = self.get_parents(current_node, output_name_to_node) - for parent in parents: - dq.appendleft(parent) - - return None - - def get_constant_value(self, output_name): - for node in self.get_nodes_by_op_type("Constant"): - if node.output[0] == output_name: - for att in node.attribute: - if att.name == "value": - return numpy_helper.to_array(att.t) - - # Fall back to intializer since constant folding might have been applied. - initializer = self.get_initializer(output_name) - if initializer is not None: - return numpy_helper.to_array(initializer) - - return None - - def get_constant_input(self, node): - for i, input in enumerate(node.input): - value = self.get_constant_value(input) - if value is not None: - return i, value - - return None, None - - def find_constant_input(self, node, expected_value, delta=0.000001): - i, value = self.get_constant_input(node) - if ( - value is not None - and value.size == 1 - and abs(value - expected_value) < delta - ): - return i - - return -1 - - def is_constant_with_specified_dimension( - self, output_name, dimensions, description - ): - value = self.get_constant_value(output_name) - if value is None: - logger.debug(f"{description} {output_name} is not initializer.") - return False - - if len(value.shape) != dimensions: - logger.debug( - f"{description} {output_name} shall have {dimensions} dimensions. 
Got shape {value.shape}" - ) - return False - - return True - - def has_constant_input(self, node, expected_value, delta=0.000001): - return self.find_constant_input(node, expected_value, delta) >= 0 - - def get_children_subgraph_nodes( - self, root_node, stop_nodes, input_name_to_nodes=None - ): - if input_name_to_nodes is None: - input_name_to_nodes = self.input_name_to_nodes() - - children = input_name_to_nodes[root_node.output[0]] - - unique_nodes = [] - - dq = deque(children) - while len(dq) > 0: - current_node = dq.pop() - if current_node in stop_nodes: - continue - - if current_node not in unique_nodes: - unique_nodes.append(current_node) - - for output in current_node.output: - if output in input_name_to_nodes: - children = input_name_to_nodes[output] - for child in children: - dq.appendleft(child) - - return unique_nodes - - def tensor_shape_to_list(self, tensor_type): - """Convert tensor shape to list""" - shape_list = [] - for d in tensor_type.shape.dim: - if d.HasField("dim_value"): - shape_list.append(d.dim_value) # known dimension - elif d.HasField("dim_param"): - shape_list.append(d.dim_param) # unknown dimension with symbolic name - else: - shape_list.append("?") # shall not happen - return shape_list - - def get_dtype(self, input_or_output: str): - """Try get data type given a name (could be initializer, graph input or output).""" - tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info} - - if input_or_output in tensor_type_map: - return tensor_type_map[input_or_output].tensor_type.elem_type - - graph_input = self.find_graph_input(input_or_output) - if graph_input: - return graph_input.type.tensor_type.elem_type - - graph_output = self.find_graph_output(input_or_output) - if graph_output: - return graph_output.type.tensor_type.elem_type - - return None - - @staticmethod - def get_node_attribute(node: NodeProto, attribute_name: str): - for attr in node.attribute: - if attr.name == attribute_name: - value = helper.get_attribute_value(attr) - return value - return None - - def remove_cascaded_cast_nodes(self): - """Remove Cast node that are followed by another Cast node like --> Cast --> Cast --> - Note that this shall be used carefully since it might introduce semantic change. - For example, float -> int -> float could get different value than the original float value. - So, it is recommended to used only in post-processing of mixed precision conversion. - """ - output_name_to_node = self.output_name_to_node() - removed_count = 0 - for node in self.nodes(): - if node.op_type == "Cast": - parent = self.get_parent( - node, 0, output_name_to_node=output_name_to_node - ) - if parent and parent.op_type == "Cast": - node.input[0] = parent.input[0] - removed_count += 1 - - if removed_count > 0: - logger.info("Removed %d cascaded Cast nodes", removed_count) - self.prune_graph() - - def remove_useless_cast_nodes(self): - """Remove cast nodes that are not needed: input and output has same data type.""" - shape_infer = self.infer_runtime_shape(update=True) - if shape_infer is None: - logger.info( - f"Skip removing useless cast nodes since shape inference failed." 
- ) - return - - def get_data_type(input_or_output_name): - dtype = self.get_dtype(input_or_output_name) - if dtype: - return dtype - if shape_infer.known_vi_[input_or_output_name].type.tensor_type.HasField( - "elem_type" - ): - return shape_infer.known_vi_[ - input_or_output_name - ].type.tensor_type.elem_type - return None - - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Cast": - input_dtype = get_data_type(node.input[0]) - output_dtype = get_data_type(node.output[0]) - if input_dtype and input_dtype == output_dtype: - nodes_to_remove.append(node) - - if nodes_to_remove: - graph_input_names = set(self.get_graphs_input_names()) - graph_output_names = set(self.get_graphs_output_names()) - for node in nodes_to_remove: - if bool(set(node.output) & graph_output_names): - if not bool(set(node.input) & graph_input_names): - self.replace_output_of_all_nodes(node.input[0], node.output[0]) - else: - continue - else: - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - self.remove_node(node) - - logger.info( - "Removed %d Cast nodes with output type same as input", - len(nodes_to_remove), - ) - - def convert_model_float32_to_float16(self, cast_input_output=True): - logger.warning( - "The function convert_model_float32_to_float16 is deprecated. Use convert_float_to_float16 instead!" - ) - self.convert_float_to_float16( - use_symbolic_shape_infer=True, keep_io_types=cast_input_output - ) - - def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs): - """Convert a model to half (default) or mixed precision. - To use mixed precision, user need specify which graph inputs, outputs, operator type or list of nodes shall keep in float32. - By default, we use symbolic shape inference to get shape and type information. If not, ONNX shape inference will be used. - Note that symbolic/ONNX shape inference might fail, and the conversion might not proceed without shape and type information. - - Args: - use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference. Defaults to True. - keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names. - If True, model inputs/outputs should be left as float32. Defaults to False. - op_block_list (List[str], optional): List of operator types to leave as float32. - Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST` as default. - node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None. - force_fp16_initializers(bool): force converting all float initializers to float16. - Default to false, which will convert only the one needed to avoid precision loss. - min_positive_val (float, optional): minimal positive value. Defaults to 1e-7. - max_finite_val (float, optional): maximal finite value. Defaults to 1e4. - """ - if "keep_io_types" not in kwargs: - kwargs["keep_io_types"] = True - - model = self.model - if use_symbolic_shape_infer: - # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc) are not recognized by onnx shape inference. 
- shape_infer_helper = SymbolicShapeInferenceHelper(model) - model = shape_infer_helper.infer_shapes( - model, auto_merge=True, guess_output_rank=False - ) - - parameters = {"disable_shape_infer": use_symbolic_shape_infer} - parameters.update( - { - key: kwargs[key] - for key in [ - "keep_io_types", - "min_positive_val", - "max_finite_val", - "op_block_list", - "node_block_list", - "force_fp16_initializers", - ] - if key in kwargs - } - ) - - fp16_model = convert_float_to_float16(model, **parameters) - self.initialize(fp16_model) - - self.remove_cascaded_cast_nodes() - - self.remove_useless_cast_nodes() - - def create_node_name(self, op_type, name_prefix=None): - """Create a unique node name that starts with a prefix (default is operator type). - The name will not be duplicated with any name that generated or existed in current graphs. - Args: - op_type (str): operator type - name_prefix (str, optional): prefix of node name. Defaults to None. - - Returns: - str: node name - """ - - if name_prefix: - prefix = name_prefix if name_prefix.endswith("_") else (name_prefix + "_") - else: - prefix = op_type + "_" - - suffix: int = 0 - if prefix in self._node_name_suffix: - suffix = self._node_name_suffix[prefix] + 1 - else: - # Check existed node name only once for a prefix as we assume create_node_name is called for every new node in fusion. - for node in self.nodes(): - if node.name and node.name.startswith(prefix): - try: - index = int(node.name[len(prefix) :]) - suffix = max(index + 1, suffix) - except ValueError: - continue - - # Record the generated suffix so that we can avoid generating duplicated name. - self._node_name_suffix[prefix] = suffix - - return prefix + str(suffix) - - def find_graph_input(self, input_name): - for input in self.model.graph.input: - if input.name == input_name: - return input - return None - - def find_graph_output(self, output_name): - for output in self.model.graph.output: - if output.name == output_name: - return output - return None - - def get_parent_subgraph_nodes(self, node, stop_nodes, output_name_to_node=None): - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - unique_nodes = [] - - parents = self.get_parents(node, output_name_to_node) - dq = deque(parents) - while len(dq) > 0: - current_node = dq.pop() - if current_node in stop_nodes: - continue - - if current_node not in unique_nodes: - unique_nodes.append(current_node) - - for input in current_node.input: - if input in output_name_to_node: - dq.appendleft(output_name_to_node[input]) - - return unique_nodes - - def get_graph_inputs(self, current_node, recursive=False): - """ - Find graph inputs that linked to current node. 
- """ - graph_inputs = [] - for input in current_node.input: - if self.find_graph_input(input) and input not in graph_inputs: - graph_inputs.append(input) - - if recursive: - parent_nodes = self.get_parent_subgraph_nodes(current_node, []) - for node in parent_nodes: - for input in node.input: - if self.find_graph_input(input) and input not in graph_inputs: - graph_inputs.append(input) - return graph_inputs - - @staticmethod - def input_index(node_output, child_node): - index = 0 - for input in child_node.input: - if input == node_output: - return index - index += 1 - return -1 - - def remove_unused_constant(self): - input_name_to_nodes = self.input_name_to_nodes() - - # remove unused constant - unused_nodes = [] - nodes = self.nodes() - for node in nodes: - if node.op_type == "Constant" and node.output[0] not in input_name_to_nodes: - unused_nodes.append(node) - - self.remove_nodes(unused_nodes) - - if len(unused_nodes) > 0: - logger.debug(f"Removed unused constant nodes: {len(unused_nodes)}") - - def prune_graph(self, outputs=None): - """ - Prune graph to keep only required outputs. It removes unnecessary inputs and nodes. - Nodes are not linked (directly or indirectly) to any required output will be removed. - - Args: - outputs (list): a list of graph outputs to retain. If it is None, all graph outputs will be kept. - """ - if len(self.graphs()) > 1: - logger.debug(f"Skip prune_graph since graph has subgraph") - return - - if outputs is None: - outputs = [output.name for output in self.model.graph.output] - - output_name_to_node = self.output_name_to_node() - all_nodes = [] - for output in outputs: - if output in output_name_to_node: - last_node = output_name_to_node[output] - if last_node in all_nodes: - continue - nodes = self.get_parent_subgraph_nodes(last_node, []) - all_nodes.append(last_node) - all_nodes.extend(nodes) - - nodes_to_remove = [] - for node in self.model.graph.node: - if node not in all_nodes: - nodes_to_remove.append(node) - - self.remove_nodes(nodes_to_remove) - - # remove outputs not in list - output_to_remove = [] - for output in self.model.graph.output: - if output.name not in outputs: - output_to_remove.append(output) - for output in output_to_remove: - self.model.graph.output.remove(output) - - # remove inputs not used by any node. 
- input_name_to_nodes = self.input_name_to_nodes() - input_to_remove = [] - for input in self.model.graph.input: - if input.name not in input_name_to_nodes: - input_to_remove.append(input) - for input in input_to_remove: - self.model.graph.input.remove(input) - - if input_to_remove or output_to_remove or nodes_to_remove: - logger.info( - "Graph pruned: {} inputs, {} outputs and {} nodes are removed".format( - len(input_to_remove), len(output_to_remove), len(nodes_to_remove) - ) - ) - - self.update_graph() - - def update_graph(self, verbose=False): - graph = self.model.graph - - remaining_input_names = [] - for node in graph.node: - if node.op_type in ["Loop", "Scan", "If"]: - # TODO: handle inner graph - logger.debug( - f"Skip update_graph since graph has operator: {node.op_type}" - ) - return - if node.op_type != "Constant": - for input_name in node.input: - if input_name not in remaining_input_names: - remaining_input_names.append(input_name) - if verbose: - logger.debug(f"remaining input names: {remaining_input_names}") - - # remove graph input that is not used - inputs_to_remove = [] - for input in graph.input: - if input.name not in remaining_input_names: - inputs_to_remove.append(input) - for input in inputs_to_remove: - graph.input.remove(input) - - names_to_remove = [input.name for input in inputs_to_remove] - logger.debug(f"remove {len(inputs_to_remove)} unused inputs: {names_to_remove}") - - # remove weights that are not used - weights_to_remove = [] - weights_to_keep = [] - for initializer in graph.initializer: - if ( - initializer.name not in remaining_input_names - and not self.find_graph_output(initializer.name) - ): - weights_to_remove.append(initializer) - else: - weights_to_keep.append(initializer.name) - for initializer in weights_to_remove: - graph.initializer.remove(initializer) - - names_to_remove = [initializer.name for initializer in weights_to_remove] - logger.debug( - f"remove {len(weights_to_remove)} unused initializers: {names_to_remove}" - ) - if verbose: - logger.debug(f"remaining initializers:{weights_to_keep}") - - self.remove_unused_constant() - - def is_safe_to_fuse_nodes( - self, nodes_to_remove, keep_outputs, input_name_to_nodes, output_name_to_node - ): - for node_to_remove in nodes_to_remove: - for output_to_remove in node_to_remove.output: - if output_to_remove in keep_outputs: - continue - - if output_to_remove in input_name_to_nodes: - for impacted_node in input_name_to_nodes[output_to_remove]: - if impacted_node not in nodes_to_remove: - logger.debug( - f"it is not safe to remove nodes since output {output_to_remove} is used by {impacted_node}" - ) - return False - return True - - @staticmethod - def graph_topological_sort(graph): - deps_count = [0] * len(graph.node) # dependency count of each node - deps_to_nodes = {} # input to node indice - sorted_nodes = [] # initialize sorted_nodes - for node_idx, node in enumerate(graph.node): - # CANNOT use len(node.input) directly because input can be optional - deps_count[node_idx] = sum(1 for _ in node.input if _) - if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs - sorted_nodes.append(graph.node[node_idx]) - continue - - for input_name in node.input: - if input_name not in deps_to_nodes: - deps_to_nodes[input_name] = [node_idx] - else: - deps_to_nodes[input_name].append(node_idx) - - # Note: this logic only applies to top level graph since a sub graph could use intializer from parent graph - initializer_names = [init.name for init in graph.initializer] - graph_input_names = 
[input.name for input in graph.input] - input_names = initializer_names + graph_input_names - input_names.sort() - prev_input_name = None - for input_name in input_names: - if prev_input_name == input_name: - continue - - prev_input_name = input_name - if input_name in deps_to_nodes: - for node_idx in deps_to_nodes[input_name]: - deps_count[node_idx] = deps_count[node_idx] - 1 - if deps_count[node_idx] == 0: - sorted_nodes.append(graph.node[node_idx]) - - start = 0 - end = len(sorted_nodes) - - while start < end: - for output in sorted_nodes[start].output: - if output in deps_to_nodes: - for node_idx in deps_to_nodes[output]: - deps_count[node_idx] = deps_count[node_idx] - 1 - if deps_count[node_idx] == 0: - sorted_nodes.append(graph.node[node_idx]) - end = end + 1 - start = start + 1 - - if end != len(graph.node): - raise RuntimeError( - f"Graph is not a DAG: end={end}, len(graph.node)={len(graph.node)}, graph.node[end]={graph.node[end]}" - ) - - graph.ClearField("node") - graph.node.extend(sorted_nodes) - - def topological_sort(self): - # TODO: support graph_topological_sort() in subgraphs - # for graph in self.graphs(): - # self.graph_topological_sort(graph) - OnnxModel.graph_topological_sort(self.model.graph) - - @staticmethod - def save( - model, - output_path, - save_as_external_data=False, - all_tensors_to_one_file=True, - size_threshold=1024, - convert_attribute=False, - ): - Path(output_path).parent.mkdir(parents=True, exist_ok=True) - - if save_as_external_data: - # Save model to external data, which is needed for model size > 2GB - output_dir = Path(output_path).parent - output_dir.mkdir(parents=True, exist_ok=True) - external_data_path = output_path + ".data" - location = ( - Path(external_data_path).name if all_tensors_to_one_file else None - ) - - if os.path.exists(output_path): - logger.info(f"Delete the existed onnx file: {output_path}") - os.remove(output_path) - - if all_tensors_to_one_file: - if os.path.exists(external_data_path): - # Delete the external data file. Otherwise, data will be appended to existing file. - logger.info( - f"Delete the existed external data file: {external_data_path}" - ) - os.remove(external_data_path) - else: - if os.listdir(output_dir): - raise RuntimeError( - f"Output directory ({output_dir}) for external data is not empty." - ) - - save_model( - model, - output_path, - save_as_external_data=True, - all_tensors_to_one_file=all_tensors_to_one_file, - location=location, - size_threshold=size_threshold, - convert_attribute=convert_attribute, - ) - else: - save_model(model, output_path) - - def save_model_to_file( - self, output_path, use_external_data_format=False, all_tensors_to_one_file=True - ): - logger.info(f"Sort graphs in topological order") - self.topological_sort() - - if output_path.endswith(".json"): # Output text for testing small model. - with open(output_path, "w") as out: - out.write(str(model)) - else: - OnnxModel.save( - self.model, - output_path, - use_external_data_format, - all_tensors_to_one_file, - ) - logger.info(f"Model saved to {output_path}") - - def get_graph_inputs_excluding_initializers(self): - """ - Returns real graph inputs (excluding initializers from older onnx model). - """ - graph_inputs = [] - for input in self.model.graph.input: - if self.get_initializer(input.name) is None: - graph_inputs.append(input) - return graph_inputs - - def get_opset_version(self): - """Get opset version of onnx domain - - Raises: - RuntimeError: ONNX model has no opset for default domain. 
- - Returns: - int: opset version of onnx domain. - """ - for opset in self.model.opset_import: - if opset.domain in ["", "ai.onnx"]: - return opset.version - raise RuntimeError("ONNX model has no opset for default domain") - - @staticmethod - def has_same_value(tensor1: TensorProto, tensor2: TensorProto) -> bool: - """Returns True when two tensors have same value. - Note that name can be different. - - Args: - tensor1 (TensorProto): initializer 1 - tensor2 (TensorProto): initializer 2 - - Returns: - bool: True when two intializers has same value. - """ - if tensor1.data_type != tensor2.data_type or tensor1.dims != tensor2.dims: - return False - if tensor1.HasField("raw_data") and tensor2.HasField("raw_data"): - return tensor1.raw_data == tensor2.raw_data - return numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2) - - def remove_duplicated_initializer(self): - """Remove initializers with duplicated values, and only keep the first one. - It could help reduce size of models (like ALBert) with shared weights. - Note: this function does not process subgraph. - """ - if len(self.graphs()) > 1: - logger.warning("remove_duplicated_initializer does not process subgraphs.") - - initializer_count = len(self.model.graph.initializer) - - same = [-1] * initializer_count - for i in range(initializer_count - 1): - if same[i] >= 0: - continue - for j in range(i + 1, initializer_count): - if OnnxModel.has_same_value( - self.model.graph.initializer[i], self.model.graph.initializer[j] - ): - same[j] = i - - count = 0 - for i in range(initializer_count): - if same[i] >= 0: - count += 1 - self.replace_input_of_all_nodes( - self.model.graph.initializer[i].name, - self.model.graph.initializer[same[i]].name, - ) - - if count > 0: - self.update_graph() - print(f"Removed {count} initializers with duplicated value") - - def add_prefix_to_names(self, prefix: str): - """Add prefix to initializer or intermediate outputs in graph. Main graph inputs and outputs are excluded. - It could help avoid conflicting in name of node_args when merging two graphs. - Note: this function does not process subgraph. 
- """ - if len(self.graphs()) > 1: - logger.warning("add_prefix_to_names does not process subgraphs.") - - # Exclude the names of inputs and outputs of main graph (but not subgraphs) - excluded = [i.name for i in self.model.graph.input] + [ - o.name for o in self.model.graph.output - ] - - for initializer in self.model.graph.initializer: - if initializer.name not in excluded: - if prefix + initializer.name not in excluded: - initializer.name = prefix + initializer.name - - for node in self.model.graph.node: - # update name of node inputs - for j in range(len(node.input)): - if node.input[j] not in excluded: - if prefix + node.input[j] not in excluded: - node.input[j] = prefix + node.input[j] - - # update name of node outputs - for j in range(len(node.output)): - if node.output[j] not in excluded: - if prefix + node.output[j] not in excluded: - node.output[j] = prefix + node.output[j] - - for value_info in self.model.graph.value_info: - if value_info.name not in excluded: - value_info.name = prefix + value_info.name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py deleted file mode 100644 index a48b53db83fa675713cd9e4ac3b38d2ed554a73b..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -import logging -import os -import sys -from typing import Dict - -# In ORT Package the symbolic_shape_infer.py is in ../tools -file_path = os.path.dirname(__file__) -if os.path.exists(os.path.join(file_path, "../tools/symbolic_shape_infer.py")): - sys.path.append(os.path.join(file_path, "../tools")) -else: - sys.path.append(os.path.join(file_path, "..")) - -from .symbolic_shape_infer import ( - SymbolicShapeInference, - get_shape_from_type_proto, - sympy, -) - -logger = logging.getLogger(__name__) - - -class SymbolicShapeInferenceHelper(SymbolicShapeInference): - def __init__( - self, - model, - verbose=0, - int_max=2**31 - 1, - auto_merge=True, - guess_output_rank=False, - ): - super().__init__(int_max, auto_merge, guess_output_rank, verbose) - self.model_ = model - self.all_shapes_inferred_: bool = False - self.is_inferred_: bool = False - self.dynamic_axis_mapping_: Dict[str, int] = {} - - def infer(self, dynamic_axis_mapping: Dict[str, int], max_runs: int = 128): - """Run shape inference, and try replace dynamic axis from string to integer when mapping is provided. 
- - Args: - dynamic_axis_mapping (_type_): a dictionary with name of dynamic axis as key, like {"batch_size" : 4} - max_runs (int, optional): limit maximum number of runs to avoid infinite loop. Defaults to 32. - - Returns: - bool: whether all shapes has been inferred or not. - """ - assert dynamic_axis_mapping is not None - - if self.is_inferred_ and self.dynamic_axis_mapping_ == dynamic_axis_mapping: - return self.all_shapes_inferred_ - - self.dynamic_axis_mapping_ = dynamic_axis_mapping - - self._preprocess(self.model_) - - count = 0 - while self.run_: - logger.debug(f"shape infer run {count}") - self.all_shapes_inferred_ = self._infer_impl() - count += 1 - if max_runs > 0 and count >= max_runs: - break - - self.is_inferred_ = True - return self.all_shapes_inferred_ - - def _get_sympy_shape(self, node, idx): - """Override it to ensure shape inference by giving the actual value of dynamic axis.""" - sympy_shape = [] - - shape = self._get_shape(node, idx) - if shape: - for dim in shape: - if isinstance(dim, str): - if dim in self.dynamic_axis_mapping_: - sympy_shape.append(self.dynamic_axis_mapping_[dim]) - elif dim in self.symbolic_dims_: - sympy_shape.append(self.symbolic_dims_[dim]) - else: - sympy_shape.append(sympy.Symbol(dim, integer=True)) - else: - assert dim is not None - sympy_shape.append(dim) - return sympy_shape - - def get_edge_shape(self, edge): - """Get shape of an edge. - - Args: - edge (str): name of edge - - Returns: - Optional[List[int]]: the shape, or None if shape is unknown - """ - assert self.all_shapes_inferred_ - if edge not in self.known_vi_: - print("Cannot retrieve the shape of " + str(edge)) - return None - - type_proto = self.known_vi_[edge].type - shape = get_shape_from_type_proto(type_proto) - - if shape is not None: - for i, dim in enumerate(shape): - if isinstance(dim, str) and dim in self.dynamic_axis_mapping_: - shape[i] = self.dynamic_axis_mapping_[dim] - - return shape - - def compare_shape(self, edge, edge_other): - """Compare shape of two edges. - - Args: - edge (str): name of edge - edge_other (str): name of another edge - - Raises: - Exception: At least one shape is missed for edges to compare - - Returns: - bool: whether the shape is same or not - """ - assert self.all_shapes_inferred_ - shape = self.get_edge_shape(edge) - shape_other = self.get_edge_shape(edge_other) - if shape is None or shape_other is None: - raise Exception("At least one shape is missed for edges to compare") - return shape == shape_other diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py deleted file mode 100644 index 2311ad57fdefa502a9e6d7edf44dc884c843ee51..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py +++ /dev/null @@ -1,2805 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -# -*- coding: UTF-8 -*- -import argparse -import logging - -import numpy as np -import onnx -import sympy -from onnx import helper, numpy_helper, shape_inference -from packaging import version - -assert version.parse(onnx.__version__) >= version.parse("1.8.0") - -logger = logging.getLogger(__name__) - - -def get_attribute(node, attr_name, default_value=None): - found = [attr for attr in node.attribute if attr.name == attr_name] - if found: - return helper.get_attribute_value(found[0]) - return default_value - - -def get_dim_from_proto(dim): - return ( - getattr(dim, dim.WhichOneof("value")) - if type(dim.WhichOneof("value")) == str - else None - ) - - -def is_sequence(type_proto): - cls_type = type_proto.WhichOneof("value") - assert cls_type in ["tensor_type", "sequence_type"] - return cls_type == "sequence_type" - - -def get_shape_from_type_proto(type_proto): - assert not is_sequence(type_proto) - if type_proto.tensor_type.HasField("shape"): - return [get_dim_from_proto(d) for d in type_proto.tensor_type.shape.dim] - else: - return None # note no shape is different from shape without dim (scalar) - - -def get_shape_from_value_info(vi): - cls_type = vi.type.WhichOneof("value") - if cls_type is None: - return None - if is_sequence(vi.type): - if "tensor_type" == vi.type.sequence_type.elem_type.WhichOneof("value"): - return get_shape_from_type_proto(vi.type.sequence_type.elem_type) - else: - return None - else: - return get_shape_from_type_proto(vi.type) - - -def make_named_value_info(name): - vi = onnx.ValueInfoProto() - vi.name = name - return vi - - -def get_shape_from_sympy_shape(sympy_shape): - return [ - None if i is None else (int(i) if is_literal(i) else str(i)) - for i in sympy_shape - ] - - -def is_literal(dim): - return type(dim) in [int, np.int64, np.int32, sympy.Integer] or ( - hasattr(dim, "is_number") and dim.is_number - ) - - -def handle_negative_axis(axis, rank): - assert axis < rank and axis >= -rank - return axis if axis >= 0 else rank + axis - - -def get_opset(mp, domain=None): - domain = domain or ["", "onnx", "ai.onnx"] - if type(domain) != list: - domain = [domain] - for opset in mp.opset_import: - if opset.domain in domain: - return opset.version - - return None - - -def as_scalar(x): - if type(x) == list: - assert len(x) == 1 - return x[0] - elif type(x) == np.ndarray: - return x.item() - else: - return x - - -def as_list(x, keep_none): - if type(x) == list: - return x - elif type(x) == np.ndarray: - return list(x) - elif keep_none and x is None: - return None - else: - return [x] - - -def sympy_reduce_product(x): - if type(x) == list: - value = sympy.Integer(1) - for v in x: - value = value * v - else: - value = x - return value - - -class SymbolicShapeInference: - def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): - self.dispatcher_ = { - "Add": self._infer_symbolic_compute_ops, - "ArrayFeatureExtractor": self._infer_ArrayFeatureExtractor, - "AveragePool": self._infer_Pool, - "BatchNormalization": self._infer_BatchNormalization, - "Cast": self._infer_Cast, - "CategoryMapper": self._infer_CategoryMapper, - "Compress": self._infer_Compress, - "Concat": self._infer_Concat, - "ConcatFromSequence": self._infer_ConcatFromSequence, - "Constant": self._infer_Constant, - "ConstantOfShape": self._infer_ConstantOfShape, - "Conv": 
self._infer_Conv, - "CumSum": self._pass_on_shape_and_type, - "Div": self._infer_symbolic_compute_ops, - "Einsum": self._infer_Einsum, - "Expand": self._infer_Expand, - "Equal": self._infer_symbolic_compute_ops, - "Floor": self._infer_symbolic_compute_ops, - "Gather": self._infer_Gather, - "GatherElements": self._infer_GatherElements, - "GatherND": self._infer_GatherND, - "Identity": self._pass_on_shape_and_type, - "If": self._infer_If, - "Loop": self._infer_Loop, - "MatMul": self._infer_MatMul, - "MatMulInteger16": self._infer_MatMulInteger, - "MaxPool": self._infer_Pool, - "Max": self._infer_symbolic_compute_ops, - "Min": self._infer_symbolic_compute_ops, - "Mul": self._infer_symbolic_compute_ops, - "NonMaxSuppression": self._infer_NonMaxSuppression, - "NonZero": self._infer_NonZero, - "OneHot": self._infer_OneHot, - "Pad": self._infer_Pad, - "Range": self._infer_Range, - "Reciprocal": self._pass_on_shape_and_type, - "ReduceSum": self._infer_ReduceSum, - "ReduceProd": self._infer_ReduceProd, - "Reshape": self._infer_Reshape, - "Resize": self._infer_Resize, - "Round": self._pass_on_shape_and_type, - "Scan": self._infer_Scan, - "ScatterElements": self._infer_ScatterElements, - "SequenceAt": self._infer_SequenceAt, - "SequenceInsert": self._infer_SequenceInsert, - "Shape": self._infer_Shape, - "Size": self._infer_Size, - "Slice": self._infer_Slice, - "SoftmaxCrossEntropyLoss": self._infer_SoftmaxCrossEntropyLoss, - "SoftmaxCrossEntropyLossInternal": self._infer_SoftmaxCrossEntropyLoss, - "NegativeLogLikelihoodLossInternal": self._infer_SoftmaxCrossEntropyLoss, - "Split": self._infer_Split, - "SplitToSequence": self._infer_SplitToSequence, - "Squeeze": self._infer_Squeeze, - "Sub": self._infer_symbolic_compute_ops, - "Tile": self._infer_Tile, - "TopK": self._infer_TopK, - "Transpose": self._infer_Transpose, - "Unsqueeze": self._infer_Unsqueeze, - "Where": self._infer_symbolic_compute_ops, - "ZipMap": self._infer_ZipMap, - "Neg": self._infer_symbolic_compute_ops, - # contrib ops: - "Attention": self._infer_Attention, - "BiasGelu": self._infer_BiasGelu, - "EmbedLayerNormalization": self._infer_EmbedLayerNormalization, - "FastGelu": self._infer_FastGelu, - "Gelu": self._infer_Gelu, - "LayerNormalization": self._infer_LayerNormalization, - "LongformerAttention": self._infer_LongformerAttention, - "PythonOp": self._infer_PythonOp, - "SkipLayerNormalization": self._infer_SkipLayerNormalization, - } - self.aten_op_dispatcher_ = { - "embedding": self._infer_Gather, - "bitwise_or": self._infer_aten_bitwise_or, - "diagonal": self._infer_aten_diagonal, - "max_pool2d_with_indices": self._infer_aten_pool2d, - "max": self._infer_aten_minmax, - "min": self._infer_aten_minmax, - "multinomial": self._infer_aten_multinomial, - "unfold": self._infer_aten_unfold, - "argmax": self._infer_aten_argmax, - "avg_pool2d": self._infer_aten_pool2d, - "_adaptive_avg_pool2d": self._infer_aten_pool2d, - "numpy_T": self._infer_Transpose, - } - self.run_ = True - self.suggested_merge_ = {} - self.symbolic_dims_ = {} - self.input_symbols_ = {} - self.auto_merge_ = auto_merge - self.guess_output_rank_ = guess_output_rank - self.verbose_ = verbose - self.int_max_ = int_max - self.subgraph_id_ = 0 - self.prefix_ = prefix - - def _add_suggested_merge(self, symbols, apply=False): - assert all( - [ - (type(s) == str and s in self.symbolic_dims_) or is_literal(s) - for s in symbols - ] - ) - symbols = set(symbols) - for k, v in self.suggested_merge_.items(): - if k in symbols: - symbols.remove(k) - symbols.add(v) - map_to = None - # 
if there is literal, map to it first - for s in symbols: - if is_literal(s): - map_to = s - break - # when no literals, map to input symbolic dims, then existing symbolic dims - if map_to is None: - for s in symbols: - if s in self.input_symbols_: - map_to = s - break - if map_to is None: - for s in symbols: - if type(self.symbolic_dims_[s]) == sympy.Symbol: - map_to = s - break - # when nothing to map to, use the shorter one - if map_to is None: - if self.verbose_ > 0: - logger.warning( - "Potential unsafe merge between symbolic expressions: ({})".format( - ",".join(symbols) - ) - ) - symbols_list = list(symbols) - lens = [len(s) for s in symbols_list] - map_to = symbols_list[lens.index(min(lens))] - symbols.remove(map_to) - - for s in symbols: - if s == map_to: - continue - if is_literal(map_to) and is_literal(s): - assert int(map_to) == int(s) - self.suggested_merge_[s] = int(map_to) if is_literal(map_to) else map_to - for k, v in self.suggested_merge_.items(): - if v == s: - self.suggested_merge_[k] = map_to - if apply and self.auto_merge_: - self._apply_suggested_merge() - - def _apply_suggested_merge(self, graph_input_only=False): - if not self.suggested_merge_: - return - for i in list(self.out_mp_.graph.input) + ( - [] if graph_input_only else list(self.out_mp_.graph.value_info) - ): - for d in i.type.tensor_type.shape.dim: - if d.dim_param in self.suggested_merge_: - v = self.suggested_merge_[d.dim_param] - if is_literal(v): - d.dim_value = int(v) - else: - d.dim_param = v - - def _preprocess(self, in_mp): - self.out_mp_ = onnx.ModelProto() - self.out_mp_.CopyFrom(in_mp) - self.graph_inputs_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) - self.initializers_ = dict([(i.name, i) for i in self.out_mp_.graph.initializer]) - self.known_vi_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) - self.known_vi_.update( - dict( - [ - ( - i.name, - helper.make_tensor_value_info( - i.name, i.data_type, list(i.dims) - ), - ) - for i in self.out_mp_.graph.initializer - ] - ) - ) - - def _merge_symbols(self, dims): - if not all([type(d) == str for d in dims]): - if self.auto_merge_: - unique_dims = list(set(dims)) - is_int = [is_literal(d) for d in unique_dims] - assert ( - sum(is_int) <= 1 - ) # if there are more than 1 unique ints, something is wrong - if sum(is_int) == 1: - int_dim = is_int.index(1) - if self.verbose_ > 0: - logger.debug( - "dim {} has been merged with value {}".format( - unique_dims[:int_dim] + unique_dims[int_dim + 1 :], - unique_dims[int_dim], - ) - ) - self._check_merged_dims(unique_dims, allow_broadcast=False) - return unique_dims[int_dim] - else: - if self.verbose_ > 0: - logger.debug( - "dim {} has been mergd with dim {}".format( - unique_dims[1:], unique_dims[0] - ) - ) - return dims[0] - else: - return None - if all([d == dims[0] for d in dims]): - return dims[0] - merged = [ - self.suggested_merge_[d] if d in self.suggested_merge_ else d for d in dims - ] - if all([d == merged[0] for d in merged]): - assert merged[0] in self.symbolic_dims_ - return merged[0] - else: - return None - - # broadcast from right to left, and merge symbolic dims if needed - def _broadcast_shapes(self, shape1, shape2): - new_shape = [] - rank1 = len(shape1) - rank2 = len(shape2) - new_rank = max(rank1, rank2) - for i in range(new_rank): - dim1 = shape1[rank1 - 1 - i] if i < rank1 else 1 - dim2 = shape2[rank2 - 1 - i] if i < rank2 else 1 - if dim1 == 1 or dim1 == dim2: - new_dim = dim2 - elif dim2 == 1: - new_dim = dim1 - else: - new_dim = self._merge_symbols([dim1, 
dim2]) - if not new_dim: - # warning about unsupported broadcast when not auto merge - # note that auto merge has the risk of incorrectly merge symbols while one of them being 1 - # for example, 'a' = 1, 'b' = 5 at runtime is valid broadcasting, but with auto merge 'a' == 'b' - if self.auto_merge_: - self._add_suggested_merge([dim1, dim2], apply=True) - else: - logger.warning( - "unsupported broadcast between " - + str(dim1) - + " " - + str(dim2) - ) - new_shape = [new_dim] + new_shape - return new_shape - - def _get_shape(self, node, idx): - name = node.input[idx] - if name in self.known_vi_: - vi = self.known_vi_[name] - return get_shape_from_value_info(vi) - else: - assert name in self.initializers_ - return list(self.initializers_[name].dims) - - def _get_shape_rank(self, node, idx): - return len(self._get_shape(node, idx)) - - def _get_sympy_shape(self, node, idx): - sympy_shape = [] - for d in self._get_shape(node, idx): - if type(d) == str: - sympy_shape.append( - self.symbolic_dims_[d] - if d in self.symbolic_dims_ - else sympy.Symbol(d, integer=True, nonnegative=True) - ) - else: - assert None != d - sympy_shape.append(d) - return sympy_shape - - def _get_value(self, node, idx): - name = node.input[idx] - assert name in self.sympy_data_ or name in self.initializers_ - return ( - self.sympy_data_[name] - if name in self.sympy_data_ - else numpy_helper.to_array(self.initializers_[name]) - ) - - def _try_get_value(self, node, idx): - if idx >= len(node.input): - return None - name = node.input[idx] - if name in self.sympy_data_ or name in self.initializers_: - return self._get_value(node, idx) - return None - - def _update_computed_dims(self, new_sympy_shape): - for i, new_dim in enumerate(new_sympy_shape): - if not is_literal(new_dim) and not type(new_dim) == str: - str_dim = str(new_dim) - if str_dim in self.suggested_merge_: - if is_literal(self.suggested_merge_[str_dim]): - continue # no need to create dim for literals - new_sympy_shape[i] = self.symbolic_dims_[ - self.suggested_merge_[str_dim] - ] - else: - # add new_dim if it's a computational expression - if not str(new_dim) in self.symbolic_dims_: - self.symbolic_dims_[str(new_dim)] = new_dim - - def _onnx_infer_single_node(self, node): - # skip onnx shape inference for some ops, as they are handled in _infer_* - skip_infer = node.op_type in [ - "If", - "Loop", - "Scan", - "SplitToSequence", - "ZipMap", # contrib ops - "Attention", - "BiasGelu", - "EmbedLayerNormalization", - "FastGelu", - "Gelu", - "LayerNormalization", - "LongformerAttention", - "SkipLayerNormalization", - "PythonOp", - ] - - if not skip_infer: - # Only pass initializers that satisfy the following condition: - # (1) Operator need value of some input for shape inference. - # For example, Unsqueeze in opset 13 uses the axes input to calculate shape of output. - # (2) opset version >= 9. In older version, initializer is required in graph input by onnx spec. - # (3) The initializer is not in graph input. The means the node input is "constant" in inference. 
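# --- A minimal, self-contained sketch (not part of the diff) of the point made in the
# --- comment above: for opset >= 13, Unsqueeze takes its axes as an input, so ONNX shape
# --- inference can only resolve the output shape when that axes initializer is included
# --- in the temporary graph. Tensor names and values below are illustrative only.
import numpy as np
import onnx
from onnx import helper, numpy_helper, shape_inference

x = helper.make_tensor_value_info("x", onnx.TensorProto.FLOAT, [2, 3])
y = helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, None)
axes = numpy_helper.from_array(np.array([0], dtype=np.int64), name="axes")
node = helper.make_node("Unsqueeze", ["x", "axes"], ["y"])
graph = helper.make_graph([node], "tmp", [x], [y], initializer=[axes])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
inferred = shape_inference.infer_shapes(model)
# With the "axes" initializer present, the inferred output shape should be [1, 2, 3];
# without it, the output rank/dims stay unknown.
print(inferred.graph.output[0].type.tensor_type.shape)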
- initializers = [] - if (get_opset(self.out_mp_) >= 9) and node.op_type in ["Unsqueeze"]: - initializers = [ - self.initializers_[name] - for name in node.input - if (name in self.initializers_ and name not in self.graph_inputs_) - ] - - # run single node inference with self.known_vi_ shapes - tmp_graph = helper.make_graph( - [node], - "tmp", - [self.known_vi_[i] for i in node.input if i], - [make_named_value_info(i) for i in node.output], - initializers, - ) - - self.tmp_mp_.graph.CopyFrom(tmp_graph) - - self.tmp_mp_ = shape_inference.infer_shapes(self.tmp_mp_) - - for i_o in range(len(node.output)): - o = node.output[i_o] - vi = self.out_mp_.graph.value_info.add() - if not skip_infer: - vi.CopyFrom(self.tmp_mp_.graph.output[i_o]) - else: - vi.name = o - self.known_vi_[o] = vi - - def _onnx_infer_subgraph( - self, node, subgraph, use_node_input=True, inc_subgraph_id=True - ): - if self.verbose_ > 2: - logger.debug( - "Inferencing subgraph of node {} with output({}...): {}".format( - node.name, node.output[0], node.op_type - ) - ) - # node inputs are not passed directly to the subgraph - # it's up to the node dispatcher to prepare subgraph input - # for example, with Scan/Loop, subgraph input shape would be trimmed from node input shape - # besides, inputs in subgraph could shadow implicit inputs - subgraph_inputs = set( - [i.name for i in list(subgraph.initializer) + list(subgraph.input)] - ) - subgraph_implicit_input = set( - [name for name in self.known_vi_.keys() if not name in subgraph_inputs] - ) - tmp_graph = helper.make_graph( - list(subgraph.node), - "tmp", - list(subgraph.input) + [self.known_vi_[i] for i in subgraph_implicit_input], - [make_named_value_info(i.name) for i in subgraph.output], - ) - tmp_graph.initializer.extend( - [ - i - for i in self.out_mp_.graph.initializer - if i.name in subgraph_implicit_input - ] - ) - tmp_graph.initializer.extend(subgraph.initializer) - self.tmp_mp_.graph.CopyFrom(tmp_graph) - - symbolic_shape_inference = SymbolicShapeInference( - self.int_max_, - self.auto_merge_, - self.guess_output_rank_, - self.verbose_, - prefix=self.prefix_ + "_" + str(self.subgraph_id_), - ) - if inc_subgraph_id: - self.subgraph_id_ += 1 - - all_shapes_inferred = False - symbolic_shape_inference._preprocess(self.tmp_mp_) - symbolic_shape_inference.suggested_merge_ = self.suggested_merge_.copy() - while symbolic_shape_inference.run_: - all_shapes_inferred = symbolic_shape_inference._infer_impl( - self.sympy_data_.copy() - ) - symbolic_shape_inference._update_output_from_vi() - if use_node_input: - # if subgraph uses node input, it needs to update to merged dims - subgraph.ClearField("input") - subgraph.input.extend( - symbolic_shape_inference.out_mp_.graph.input[: len(node.input)] - ) - subgraph.ClearField("output") - subgraph.output.extend(symbolic_shape_inference.out_mp_.graph.output) - subgraph.ClearField("value_info") - subgraph.value_info.extend(symbolic_shape_inference.out_mp_.graph.value_info) - subgraph.ClearField("node") - subgraph.node.extend(symbolic_shape_inference.out_mp_.graph.node) - # for new symbolic dims from subgraph output, add to main graph symbolic dims - subgraph_shapes = [ - get_shape_from_value_info(o) - for o in symbolic_shape_inference.out_mp_.graph.output - ] - subgraph_new_symbolic_dims = set( - [ - d - for s in subgraph_shapes - if s - for d in s - if type(d) == str and not d in self.symbolic_dims_ - ] - ) - new_dims = {} - for d in subgraph_new_symbolic_dims: - assert d in symbolic_shape_inference.symbolic_dims_ - new_dims[d] = 
symbolic_shape_inference.symbolic_dims_[d] - self.symbolic_dims_.update(new_dims) - return symbolic_shape_inference - - def _get_int_values(self, node, broadcast=False): - values = [self._try_get_value(node, i) for i in range(len(node.input))] - if all([v is not None for v in values]): - # some shape compute is in floating point, cast to int for sympy - for i, v in enumerate(values): - if type(v) != np.ndarray: - continue - if len(v.shape) > 1: - new_v = None # ignore value for rank > 1 - elif len(v.shape) == 0: - new_v = int(v.item()) - else: - assert len(v.shape) == 1 - new_v = [int(vv) for vv in v] - values[i] = new_v - values_len = [len(v) if type(v) == list else 0 for v in values] - max_len = max(values_len) - if max_len >= 1 and broadcast: - # broadcast - for i, v in enumerate(values): - if v is None: - continue # don't broadcast if value is unknown - if type(v) == list: - if len(v) < max_len: - values[i] = v * max_len - else: - assert len(v) == max_len - else: - values[i] = [v] * max_len - return values - - def _compute_on_sympy_data(self, node, op_func): - assert len(node.output) == 1 - values = self._get_int_values(node, broadcast=True) - if all([v is not None for v in values]): - is_list = [type(v) == list for v in values] - as_list = any(is_list) - if as_list: - self.sympy_data_[node.output[0]] = [op_func(vs) for vs in zip(*values)] - else: - self.sympy_data_[node.output[0]] = op_func(values) - - def _pass_on_sympy_data(self, node): - assert len(node.input) == 1 or node.op_type in [ - "Reshape", - "Unsqueeze", - "Squeeze", - ] - self._compute_on_sympy_data(node, lambda x: x[0]) - - def _pass_on_shape_and_type(self, node): - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - self._get_shape(node, 0), - ) - ) - - def _new_symbolic_dim(self, prefix, dim): - new_dim = "{}_d{}".format(prefix, dim) - if new_dim in self.suggested_merge_: - v = self.suggested_merge_[new_dim] - new_symbolic_dim = sympy.Integer(int(v)) if is_literal(v) else v - else: - new_symbolic_dim = sympy.Symbol(new_dim, integer=True, nonnegative=True) - self.symbolic_dims_[new_dim] = new_symbolic_dim - return new_symbolic_dim - - def _new_symbolic_dim_from_output(self, node, out_idx=0, dim=0): - return self._new_symbolic_dim( - "{}{}_{}_o{}_".format( - node.op_type, - self.prefix_, - list(self.out_mp_.graph.node).index(node), - out_idx, - ), - dim, - ) - - def _new_symbolic_shape(self, rank, node, out_idx=0): - return [ - self._new_symbolic_dim_from_output(node, out_idx, i) for i in range(rank) - ] - - def _compute_conv_pool_shape(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - if len(node.input) > 1: - W_shape = self._get_sympy_shape(node, 1) - rank = len(W_shape) - 2 # number of spatial axes - kernel_shape = W_shape[-rank:] - sympy_shape[1] = W_shape[0] - else: - W_shape = None - kernel_shape = get_attribute(node, "kernel_shape") - rank = len(kernel_shape) - - assert len(sympy_shape) == rank + 2 - - # only need to symbolic shape inference if input has symbolic dims in spatial axes - is_symbolic_dims = [not is_literal(i) for i in sympy_shape[-rank:]] - - if not any(is_symbolic_dims): - shape = get_shape_from_value_info(self.known_vi_[node.output[0]]) - if len(shape) > 0: - assert len(sympy_shape) == len(shape) - sympy_shape[-rank:] = [sympy.Integer(d) for d in shape[-rank:]] - return sympy_shape - - dilations = get_attribute(node, "dilations", [1] * rank) - strides = get_attribute(node, 
"strides", [1] * rank) - effective_kernel_shape = [ - (k - 1) * d + 1 for k, d in zip(kernel_shape, dilations) - ] - pads = get_attribute(node, "pads") - if pads is None: - pads = [0] * (2 * rank) - auto_pad = get_attribute(node, "auto_pad", b"NOTSET").decode("utf-8") - if auto_pad != "VALID" and auto_pad != "NOTSET": - try: - residual = [ - sympy.Mod(d, s) for d, s in zip(sympy_shape[-rank:], strides) - ] - total_pads = [ - max(0, (k - s) if r == 0 else (k - r)) - for k, s, r in zip(effective_kernel_shape, strides, residual) - ] - except TypeError: # sympy may throw TypeError: cannot determine truth value of Relational - total_pads = [ - max(0, (k - s)) for k, s in zip(effective_kernel_shape, strides) - ] # assuming no residual if sympy throws error - elif auto_pad == "VALID": - total_pads = [] - else: - total_pads = [0] * rank - else: - assert len(pads) == 2 * rank - total_pads = [p1 + p2 for p1, p2 in zip(pads[:rank], pads[rank:])] - - ceil_mode = get_attribute(node, "ceil_mode", 0) - for i in range(rank): - effective_input_size = sympy_shape[-rank + i] - if len(total_pads) > 0: - effective_input_size = effective_input_size + total_pads[i] - if ceil_mode: - strided_kernel_positions = sympy.ceiling( - (effective_input_size - effective_kernel_shape[i]) / strides[i] - ) - else: - strided_kernel_positions = ( - effective_input_size - effective_kernel_shape[i] - ) // strides[i] - sympy_shape[-rank + i] = strided_kernel_positions + 1 - return sympy_shape - - def _check_merged_dims(self, dims, allow_broadcast=True): - if allow_broadcast: - dims = [d for d in dims if not (is_literal(d) and int(d) <= 1)] - if not all([d == dims[0] for d in dims]): - self._add_suggested_merge(dims, apply=True) - - def _compute_matmul_shape(self, node, output_dtype=None): - lhs_shape = self._get_shape(node, 0) - rhs_shape = self._get_shape(node, 1) - lhs_rank = len(lhs_shape) - rhs_rank = len(rhs_shape) - lhs_reduce_dim = 0 - rhs_reduce_dim = 0 - assert lhs_rank > 0 and rhs_rank > 0 - if lhs_rank == 1 and rhs_rank == 1: - new_shape = [] - elif lhs_rank == 1: - rhs_reduce_dim = -2 - new_shape = rhs_shape[:rhs_reduce_dim] + [rhs_shape[-1]] - elif rhs_rank == 1: - lhs_reduce_dim = -1 - new_shape = lhs_shape[:lhs_reduce_dim] - else: - lhs_reduce_dim = -1 - rhs_reduce_dim = -2 - new_shape = ( - self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]) - + [lhs_shape[-2]] - + [rhs_shape[-1]] - ) - # merge reduce dim - self._check_merged_dims( - [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]], - allow_broadcast=False, - ) - if output_dtype is None: - # infer output_dtype from input type when not specified - output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info(node.output[0], output_dtype, new_shape) - ) - - def _fuse_tensor_type(self, node, out_idx, dst_type, src_type): - """ - update dst_tensor_type to be compatible with src_tensor_type when dimension mismatches - """ - dst_tensor_type = ( - dst_type.sequence_type.elem_type.tensor_type - if is_sequence(dst_type) - else dst_type.tensor_type - ) - src_tensor_type = ( - src_type.sequence_type.elem_type.tensor_type - if is_sequence(src_type) - else src_type.tensor_type - ) - if dst_tensor_type.elem_type != src_tensor_type.elem_type: - node_id = node.name if node.name else node.op_type - raise ValueError( - f"For node {node_id}, dst_tensor_type.elem_type != src_tensor_type.elem_type: " - f"{onnx.onnx_pb.TensorProto.DataType.Name(dst_tensor_type.elem_type)} vs " - 
f"{onnx.onnx_pb.TensorProto.DataType.Name(src_tensor_type.elem_type)}" - ) - if dst_tensor_type.HasField("shape"): - for di, ds in enumerate( - zip(dst_tensor_type.shape.dim, src_tensor_type.shape.dim) - ): - if ds[0] != ds[1]: - # create a new symbolic dimension for node/out_idx/mismatch dim id in dst_tensor_type for tensor_type - # for sequence_type, clear the dimension - new_dim = onnx.TensorShapeProto.Dimension() - if not is_sequence(dst_type): - new_dim.dim_param = str( - self._new_symbolic_dim_from_output(node, out_idx, di) - ) - dst_tensor_type.shape.dim[di].CopyFrom(new_dim) - else: - dst_tensor_type.CopyFrom(src_tensor_type) - - def _infer_ArrayFeatureExtractor(self, node): - data_shape = self._get_shape(node, 0) - indices_shape = self._get_shape(node, 1) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - data_shape[:-1] + indices_shape, - ) - ) - - def _infer_symbolic_compute_ops(self, node): - funcs = { - "Add": lambda l: l[0] + l[1], - "Div": lambda l: l[0] // l[1], # integer div in sympy - "Equal": lambda l: l[0] == l[1], - "Floor": lambda l: sympy.floor(l[0]), - "Max": lambda l: l[1] - if is_literal(l[0]) and int(l[0]) < -self.int_max_ - else ( - l[0] - if is_literal(l[1]) and int(l[1]) < -self.int_max_ - else sympy.Max(l[0], l[1]) - ), - "Min": lambda l: l[1] - if is_literal(l[0]) and int(l[0]) > self.int_max_ - else ( - l[0] - if is_literal(l[1]) and int(l[1]) > self.int_max_ - else sympy.Min(l[0], l[1]) - ), - "Mul": lambda l: l[0] * l[1], - "Sub": lambda l: l[0] - l[1], - "Where": lambda l: l[1] if l[0] else l[2], - "Neg": lambda l: -l[0], - } - assert node.op_type in funcs - self._compute_on_sympy_data(node, funcs[node.op_type]) - - def _infer_Cast(self, node): - self._pass_on_sympy_data(node) - - def _infer_CategoryMapper(self, node): - input_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type - if input_type == onnx.TensorProto.STRING: - output_type = onnx.TensorProto.INT64 - else: - output_type = onnx.TensorProto.STRING - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], output_type, self._get_shape(node, 0) - ) - ) - - def _infer_Compress(self, node): - input_shape = self._get_shape(node, 0) - # create a new symbolic dimension for Compress output - compress_len = str(self._new_symbolic_dim_from_output(node)) - axis = get_attribute(node, "axis") - if axis == None: - # when axis is not specified, input is flattened before compress so output is 1D - output_shape = [compress_len] - else: - output_shape = input_shape - output_shape[handle_negative_axis(axis, len(input_shape))] = compress_len - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - - def _infer_Concat(self, node): - if any([i in self.sympy_data_ or i in self.initializers_ for i in node.input]): - values = self._get_int_values(node) - if all([v is not None for v in values]): - assert 0 == get_attribute(node, "axis") - self.sympy_data_[node.output[0]] = [] - for i in range(len(node.input)): - value = values[i] - if type(value) == list: - self.sympy_data_[node.output[0]].extend(value) - else: - self.sympy_data_[node.output[0]].append(value) - - sympy_shape = self._get_sympy_shape(node, 0) - axis = handle_negative_axis(get_attribute(node, "axis"), len(sympy_shape)) - for i_idx in range(1, 
len(node.input)): - input_shape = self._get_sympy_shape(node, i_idx) - if input_shape: - sympy_shape[axis] = sympy_shape[axis] + input_shape[axis] - self._update_computed_dims(sympy_shape) - # merge symbolic dims for non-concat axes - for d in range(len(sympy_shape)): - if d == axis: - continue - dims = [ - self._get_shape(node, i_idx)[d] - for i_idx in range(len(node.input)) - if self._get_shape(node, i_idx) - ] - if all([d == dims[0] for d in dims]): - continue - merged = self._merge_symbols(dims) - if type(merged) == str: - sympy_shape[d] = self.symbolic_dims_[merged] if merged else None - else: - sympy_shape[d] = merged - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_ConcatFromSequence(self, node): - seq_shape = self._get_shape(node, 0) - new_axis = 1 if get_attribute(node, "new_axis") else 0 - axis = handle_negative_axis( - get_attribute(node, "axis"), len(seq_shape) + new_axis - ) - concat_dim = str(self._new_symbolic_dim_from_output(node, 0, axis)) - new_shape = seq_shape - if new_axis: - new_shape = seq_shape[:axis] + [concat_dim] + seq_shape[axis:] - else: - new_shape[axis] = concat_dim - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[ - node.input[0] - ].type.sequence_type.elem_type.tensor_type.elem_type, - new_shape, - ) - ) - - def _infer_Constant(self, node): - t = get_attribute(node, "value") - self.sympy_data_[node.output[0]] = numpy_helper.to_array(t) - - def _infer_ConstantOfShape(self, node): - sympy_shape = self._get_int_values(node)[0] - vi = self.known_vi_[node.output[0]] - if sympy_shape is not None: - if type(sympy_shape) != list: - sympy_shape = [sympy_shape] - self._update_computed_dims(sympy_shape) - # update sympy data if output type is int, and shape is known - if vi.type.tensor_type.elem_type == onnx.TensorProto.INT64 and all( - [is_literal(x) for x in sympy_shape] - ): - self.sympy_data_[node.output[0]] = np.ones( - [int(x) for x in sympy_shape], dtype=np.int64 - ) * numpy_helper.to_array(get_attribute(node, "value", 0)) - else: - # create new dynamic shape - # note input0 is a 1D vector of shape, the new symbolic shape has the rank of the shape vector length - sympy_shape = self._new_symbolic_shape(self._get_shape(node, 0)[0], node) - - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_Conv(self, node): - sympy_shape = self._compute_conv_pool_shape(node) - self._update_computed_dims(sympy_shape) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_Einsum(self, node): - # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275 - equation = get_attribute(node, "equation") - equation = equation.replace(b" ", b"") - mid_index = equation.find(b"->") - left_equation = equation[:mid_index] if mid_index != -1 else equation - - num_operands = 0 - num_ellipsis = 0 - num_ellipsis_indices = 0 - - letter_to_dim = {} - - terms = left_equation.split(b",") - for term in terms: - ellipsis_index = term.find(b"...") - shape = self._get_shape(node, num_operands) - rank = len(shape) - if ellipsis_index != -1: - if 
num_ellipsis == 0: - num_ellipsis_indices = rank - len(term) + 3 - num_ellipsis = num_ellipsis + 1 - for i in range(1, rank + 1): - letter = term[-i] - if letter != 46: # letter != b'.' - dim = shape[-i] - if letter not in letter_to_dim.keys(): - letter_to_dim[letter] = dim - elif type(dim) != sympy.Symbol: - letter_to_dim[letter] = dim - num_operands = num_operands + 1 - - new_sympy_shape = [] - from collections import OrderedDict - - num_letter_occurrences = OrderedDict() - if mid_index != -1: - right_equation = equation[mid_index + 2 :] - right_ellipsis_index = right_equation.find(b"...") - if right_ellipsis_index != -1: - for i in range(num_ellipsis_indices): - new_sympy_shape.append(shape[i]) - for c in right_equation: - if c != 46: # c != b'.' - new_sympy_shape.append(letter_to_dim[c]) - else: - for i in range(num_ellipsis_indices): - new_sympy_shape.append(shape[i]) - for c in left_equation: - if c != 44 and c != 46: # c != b',' and c != b'.': - if c in num_letter_occurrences: - num_letter_occurrences[c] = num_letter_occurrences[c] + 1 - else: - num_letter_occurrences[c] = 1 - for key, value in num_letter_occurrences.items(): - if value == 1: - new_sympy_shape.append(letter_to_dim[key]) - - output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info(node.output[0], output_dtype, new_sympy_shape) - ) - - def _infer_Expand(self, node): - expand_to_shape = as_list(self._try_get_value(node, 1), keep_none=True) - if expand_to_shape is not None: - # new_shape's dim can come from shape value - self._update_computed_dims(expand_to_shape) - shape = self._get_shape(node, 0) - new_shape = self._broadcast_shapes( - shape, get_shape_from_sympy_shape(expand_to_shape) - ) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - new_shape, - ) - ) - - def _infer_Gather(self, node): - data_shape = self._get_shape(node, 0) - axis = handle_negative_axis(get_attribute(node, "axis", 0), len(data_shape)) - indices_shape = self._get_shape(node, 1) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - data_shape[:axis] + indices_shape + data_shape[axis + 1 :], - ) - ) - # for 1D input, do some sympy compute - if ( - node.input[0] in self.sympy_data_ - and len(data_shape) == 1 - and 0 == get_attribute(node, "axis", 0) - ): - idx = self._try_get_value(node, 1) - if idx is not None: - data = self.sympy_data_[node.input[0]] - if type(data) == list: - if type(idx) == np.ndarray and len(idx.shape) == 1: - self.sympy_data_[node.output[0]] = [data[int(i)] for i in idx] - else: - self.sympy_data_[node.output[0]] = data[int(idx)] - else: - assert idx == 0 or idx == -1 - self.sympy_data_[node.output[0]] = data - - def _infer_GatherElements(self, node): - indices_shape = self._get_shape(node, 1) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - indices_shape, - ) - ) - - def _infer_GatherND(self, node): - data_shape = self._get_shape(node, 0) - data_rank = len(data_shape) - indices_shape = self._get_shape(node, 1) - indices_rank = len(indices_shape) - last_index_dimension = indices_shape[-1] - assert is_literal(last_index_dimension) and last_index_dimension <= data_rank - new_shape = 
indices_shape[:-1] + data_shape[last_index_dimension:] - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - new_shape, - ) - ) - - def _infer_If(self, node): - # special case for constant condition, in case there are mismatching shape from the non-executed branch - subgraphs = [ - get_attribute(node, "then_branch"), - get_attribute(node, "else_branch"), - ] - cond = self._try_get_value(node, 0) - if cond is not None: - if as_scalar(cond) > 0: - subgraphs[1].CopyFrom(subgraphs[0]) - else: - subgraphs[0].CopyFrom(subgraphs[1]) - - for i_sub, subgraph in enumerate(subgraphs): - subgraph_infer = self._onnx_infer_subgraph( - node, subgraph, use_node_input=False - ) - for i_out in range(len(node.output)): - vi = self.known_vi_[node.output[i_out]] - if i_sub == 0: - vi.CopyFrom(subgraph.output[i_out]) - vi.name = node.output[i_out] - else: - self._fuse_tensor_type( - node, i_out, vi.type, subgraph.output[i_out].type - ) - - # pass on sympy data from subgraph, if cond is constant - if cond is not None and i_sub == (0 if as_scalar(cond) > 0 else 1): - if subgraph.output[i_out].name in subgraph_infer.sympy_data_: - self.sympy_data_[vi.name] = subgraph_infer.sympy_data_[ - subgraph.output[i_out].name - ] - - def _infer_Loop(self, node): - subgraph = get_attribute(node, "body") - assert len(subgraph.input) == len(node.input) - num_loop_carried = ( - len(node.input) - 2 - ) # minus the length and initial loop condition - # when sequence_type is used as loop carried input - # needs to run subgraph infer twice if the tensor shape in sequence contains None - for i, si in enumerate(subgraph.input): - si_name = si.name - si.CopyFrom(self.known_vi_[node.input[i]]) - si.name = si_name - - self._onnx_infer_subgraph(node, subgraph) - - # check subgraph input/output for shape changes in loop carried variables - # for tensor_type, create new symbolic dim when changing, i.e., output = Concat(input, a) - # for sequence_type, propagate from output to input - need_second_infer = False - for i_out in range(1, num_loop_carried + 1): - so = subgraph.output[i_out] - so_shape = get_shape_from_value_info(so) - if is_sequence(so.type): - if so_shape and None in so_shape: - # copy shape from output to input - # note that loop input is [loop_len, cond, input_0, input_1, ...] - # while loop output is [cond, output_0, output_1, ...] 
- subgraph.input[i_out + 1].type.sequence_type.elem_type.CopyFrom( - so.type.sequence_type.elem_type - ) - need_second_infer = True - else: - si = subgraph.input[i_out + 1] - si_shape = get_shape_from_value_info(si) - for di, dims in enumerate(zip(si_shape, so_shape)): - if dims[0] != dims[1]: - new_dim = onnx.TensorShapeProto.Dimension() - new_dim.dim_param = str( - self._new_symbolic_dim_from_output(node, i_out, di) - ) - si.type.tensor_type.shape.dim[di].CopyFrom(new_dim) - so.type.tensor_type.shape.dim[di].CopyFrom(new_dim) - need_second_infer = True - - if need_second_infer: - if self.verbose_ > 2: - logger.debug( - "Rerun Loop: {}({}...), because of sequence in loop carried variables".format( - node.name, node.output[0] - ) - ) - self._onnx_infer_subgraph(node, subgraph, inc_subgraph_id=False) - - # create a new symbolic dimension for iteration dependent dimension - loop_iter_dim = str(self._new_symbolic_dim_from_output(node)) - for i in range(len(node.output)): - vi = self.known_vi_[node.output[i]] - vi.CopyFrom( - subgraph.output[i + 1] - ) # first subgraph output is condition, not in node output - if i >= num_loop_carried: - assert not is_sequence( - vi.type - ) # TODO: handle loop accumulation in sequence_type - subgraph_vi_dim = subgraph.output[i + 1].type.tensor_type.shape.dim - vi.type.tensor_type.shape.ClearField("dim") - vi_dim = vi.type.tensor_type.shape.dim - vi_dim.add().dim_param = loop_iter_dim - vi_dim.extend(list(subgraph_vi_dim)) - vi.name = node.output[i] - - def _infer_MatMul(self, node): - self._compute_matmul_shape(node) - - def _infer_MatMulInteger(self, node): - self._compute_matmul_shape(node, onnx.TensorProto.INT32) - - def _infer_NonMaxSuppression(self, node): - selected = str(self._new_symbolic_dim_from_output(node)) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], onnx.TensorProto.INT64, [selected, 3] - ) - ) - - def _infer_NonZero(self, node): - input_rank = self._get_shape_rank(node, 0) - # create a new symbolic dimension for NonZero output - nz_len = str(self._new_symbolic_dim_from_output(node, 0, 1)) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], vi.type.tensor_type.elem_type, [input_rank, nz_len] - ) - ) - - def _infer_OneHot(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - depth = self._try_get_value(node, 1) - axis = get_attribute(node, "axis", -1) - axis = handle_negative_axis(axis, len(sympy_shape) + 1) - new_shape = get_shape_from_sympy_shape( - sympy_shape[:axis] - + [ - self._new_symbolic_dim_from_output(node) - if not is_literal(depth) - else depth - ] - + sympy_shape[axis:] - ) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[2]].type.tensor_type.elem_type, - new_shape, - ) - ) - - def _infer_Pad(self, node): - if get_opset(self.out_mp_) <= 10: - pads = get_attribute(node, "pads") - else: - pads = self._try_get_value(node, 1) - - sympy_shape = self._get_sympy_shape(node, 0) - rank = len(sympy_shape) - - if pads is not None: - assert len(pads) == 2 * rank - new_sympy_shape = [ - d + pad_up + pad_down - for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:]) - ] - self._update_computed_dims(new_sympy_shape) - else: - # dynamic pads, create new symbolic dimensions - new_sympy_shape = self._new_symbolic_shape(rank, node) - output_tp = self.known_vi_[node.input[0]].type.tensor_type.elem_type - - vi = 
self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape) - ) - ) - - def _infer_Pool(self, node): - sympy_shape = self._compute_conv_pool_shape(node) - self._update_computed_dims(sympy_shape) - for o in node.output: - if not o: - continue - vi = self.known_vi_[o] - vi.CopyFrom( - helper.make_tensor_value_info( - o, - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_aten_bitwise_or(self, node): - shape0 = self._get_shape(node, 0) - shape1 = self._get_shape(node, 1) - new_shape = self._broadcast_shapes(shape0, shape1) - t0 = self.known_vi_[node.input[0]] - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], t0.type.tensor_type.elem_type, new_shape - ) - ) - - def _infer_aten_diagonal(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - rank = len(sympy_shape) - offset = self._try_get_value(node, 1) - dim1 = self._try_get_value(node, 2) - dim2 = self._try_get_value(node, 3) - - assert offset is not None and dim1 is not None and dim2 is not None - dim1 = handle_negative_axis(dim1, rank) - dim2 = handle_negative_axis(dim2, rank) - - new_shape = [] - for dim, val in enumerate(sympy_shape): - if dim not in [dim1, dim2]: - new_shape.append(val) - - shape1 = sympy_shape[dim1] - shape2 = sympy_shape[dim2] - if offset >= 0: - diag_shape = sympy.Max(0, sympy.Min(shape1, shape2 - offset)) - else: - diag_shape = sympy.Max(0, sympy.Min(shape1 + offset, shape2)) - new_shape.append(diag_shape) - - if node.output[0]: - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_shape), - ) - ) - - def _infer_aten_multinomial(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - rank = len(sympy_shape) - assert rank in [1, 2] - num_samples = self._try_get_value(node, 1) - di = rank - 1 - last_dim = ( - num_samples - if num_samples - else str(self._new_symbolic_dim_from_output(node, 0, di)) - ) - output_shape = sympy_shape[:-1] + [last_dim] - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - onnx.TensorProto.INT64, - get_shape_from_sympy_shape(output_shape), - ) - ) - - def _infer_aten_pool2d(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - assert len(sympy_shape) == 4 - sympy_shape[-2:] = [ - self._new_symbolic_dim_from_output(node, 0, i) for i in [2, 3] - ] - self._update_computed_dims(sympy_shape) - for i, o in enumerate(node.output): - if not o: - continue - vi = self.known_vi_[o] - elem_type = ( - onnx.TensorProto.INT64 - if i == 1 - else self.known_vi_[node.input[0]].type.tensor_type.elem_type - ) - vi.CopyFrom( - helper.make_tensor_value_info( - o, elem_type, get_shape_from_sympy_shape(sympy_shape) - ) - ) - - def _infer_aten_minmax(self, node): - vi = self.known_vi_[node.output[0]] - if len(node.input) == 1: - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - [], - ) - ) - else: - assert len(node.input) == 3 - keepdim = self._try_get_value(node, 2) - assert keepdim is not None # can only handle known keepdim case. 
- dim = self._try_get_value(node, 1) - if dim is None: - rank = self._get_shape_rank(node, 0) - output_shape = self._new_symbolic_shape( - rank if keepdim else rank - 1, node - ) - else: - shape = self._get_sympy_shape(node, 0) - dim = handle_negative_axis(dim, len(shape)) - output_shape = shape[:dim] - if keepdim: - output_shape += [1] - output_shape += shape[dim + 1 :] - - output_shape = get_shape_from_sympy_shape(output_shape) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - vi1 = self.known_vi_[node.output[1]] - vi1.CopyFrom( - helper.make_tensor_value_info( - node.output[1], onnx.TensorProto.INT64, output_shape - ) - ) - - def _infer_aten_unfold(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - dimension = self._try_get_value(node, 1) - size = self._try_get_value(node, 2) - step = self._try_get_value(node, 3) - if dimension is not None and size is not None and step is not None: - assert dimension < len(sympy_shape) - sympy_shape[dimension] = (sympy_shape[dimension] - size) // step + 1 - sympy_shape.append(size) - else: - rank = len(sympy_shape) - sympy_shape = self._new_symbolic_shape(rank + 1, node) - self._update_computed_dims(sympy_shape) - if node.output[0]: - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_aten_argmax(self, node): - new_shape = None - if node.input[1] == "": - # The argmax of the flattened input is returned. - new_shape = [] - else: - dim = self._try_get_value(node, 1) - keepdim = self._try_get_value(node, 2) - if keepdim is not None: - sympy_shape = self._get_sympy_shape(node, 0) - if dim is not None: - dim = handle_negative_axis(dim, len(sympy_shape)) - if keepdim: - sympy_shape[dim] = 1 - else: - del sympy_shape[dim] - else: - rank = len(sympy_shape) - sympy_shape = self._new_symbolic_shape( - rank if keepdim else rank - 1, node - ) - self._update_computed_dims(sympy_shape) - new_shape = get_shape_from_sympy_shape(sympy_shape) - if node.output[0] and new_shape is not None: - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], onnx.TensorProto.INT64, new_shape - ) - ) - - def _infer_BatchNormalization(self, node): - self._propagate_shape_and_type(node) - - # this works for opsets < 14 and 14 since we check i < len(node.output) in the loop - for i in [1, 2, 3, 4]: - if i < len(node.output) and node.output[i] != "": - # all of these parameters have the same shape as the 1st input - self._propagate_shape_and_type(node, input_index=1, output_index=i) - - def _infer_Range(self, node): - vi = self.known_vi_[node.output[0]] - input_data = self._get_int_values(node) - if all([i is not None for i in input_data]): - start = as_scalar(input_data[0]) - limit = as_scalar(input_data[1]) - delta = as_scalar(input_data[2]) - new_sympy_shape = [sympy.Max(sympy.ceiling((limit - start) / delta), 0)] - else: - new_sympy_shape = [self._new_symbolic_dim_from_output(node)] - self._update_computed_dims(new_sympy_shape) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - - def _infer_ReduceSum(self, node): - keep_dims = get_attribute(node, "keepdims", 1) - if get_opset(self.out_mp_) >= 13 and len(node.input) > 1: - # 
ReduceSum changes axes to input[1] in opset 13 - axes = self._try_get_value(node, 1) - vi = self.known_vi_[node.output[0]] - if axes is None: - assert keep_dims # can only handle keep_dims==True when axes is unknown, by generating new ranks - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape( - self._new_symbolic_shape( - self._get_shape_rank(node, 0), node - ) - ), - ) - ) - else: - shape = self._get_shape(node, 0) - output_shape = [] - axes = [handle_negative_axis(a, len(shape)) for a in axes] - for i, d in enumerate(shape): - if i in axes: - if keep_dims: - output_shape.append(1) - else: - output_shape.append(d) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - - def _infer_ReduceProd(self, node): - axes = get_attribute(node, "axes") - keep_dims = get_attribute(node, "keepdims", 1) - if keep_dims == 0 and axes == [0]: - data = self._get_int_values(node)[0] - if data is not None: - self.sympy_data_[node.output[0]] = sympy_reduce_product(data) - - def _infer_Reshape(self, node): - shape_value = self._try_get_value(node, 1) - vi = self.known_vi_[node.output[0]] - if shape_value is None: - shape_shape = self._get_shape(node, 1) - assert len(shape_shape) == 1 - shape_rank = shape_shape[0] - assert is_literal(shape_rank) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape( - self._new_symbolic_shape(shape_rank, node) - ), - ) - ) - else: - input_sympy_shape = self._get_sympy_shape(node, 0) - total = int(1) - for d in input_sympy_shape: - total = total * d - new_sympy_shape = [] - deferred_dim_idx = -1 - non_deferred_size = int(1) - for i, d in enumerate(shape_value): - if type(d) == sympy.Symbol: - new_sympy_shape.append(d) - elif d == 0: - new_sympy_shape.append(input_sympy_shape[i]) - non_deferred_size = non_deferred_size * input_sympy_shape[i] - else: - new_sympy_shape.append(d) - if d == -1: - deferred_dim_idx = i - elif d != 0: - non_deferred_size = non_deferred_size * d - - assert new_sympy_shape.count(-1) < 2 - if -1 in new_sympy_shape: - new_dim = total // non_deferred_size - new_sympy_shape[deferred_dim_idx] = new_dim - - self._update_computed_dims(new_sympy_shape) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - - self._pass_on_sympy_data(node) - - def _infer_Resize(self, node): - vi = self.known_vi_[node.output[0]] - input_sympy_shape = self._get_sympy_shape(node, 0) - if get_opset(self.out_mp_) <= 10: - scales = self._try_get_value(node, 1) - if scales is not None: - new_sympy_shape = [ - sympy.simplify(sympy.floor(d * s)) - for d, s in zip(input_sympy_shape, scales) - ] - self._update_computed_dims(new_sympy_shape) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - else: - roi = self._try_get_value(node, 1) - scales = self._try_get_value(node, 2) - sizes = self._try_get_value(node, 3) - if sizes is not None: - new_sympy_shape = [sympy.simplify(sympy.floor(s)) for s in sizes] - self._update_computed_dims(new_sympy_shape) - elif scales is not None: - rank = len(scales) - if ( - get_attribute(node, "coordinate_transformation_mode") - == "tf_crop_and_resize" - ): - assert 
len(roi) == 2 * rank - roi_start = list(roi)[:rank] - roi_end = list(roi)[rank:] - else: - roi_start = [0] * rank - roi_end = [1] * rank - scales = list(scales) - new_sympy_shape = [ - sympy.simplify(sympy.floor(d * (end - start) * scale)) - for d, start, end, scale in zip( - input_sympy_shape, roi_start, roi_end, scales - ) - ] - self._update_computed_dims(new_sympy_shape) - else: - new_sympy_shape = self._new_symbolic_shape( - self._get_shape_rank(node, 0), node - ) - - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - - def _infer_Scan(self, node): - subgraph = get_attribute(node, "body") - num_scan_inputs = get_attribute(node, "num_scan_inputs") - scan_input_axes = get_attribute(node, "scan_input_axes", [0] * num_scan_inputs) - num_scan_states = len(node.input) - num_scan_inputs - scan_input_axes = [ - handle_negative_axis(ax, self._get_shape_rank(node, i + num_scan_states)) - for i, ax in enumerate(scan_input_axes) - ] - # We may have cases where the subgraph has optional inputs that appear in both subgraph's input and initializer, - # but not in the node's input. In such cases, the input model might be invalid, but let's skip those optional inputs. - assert len(subgraph.input) >= len(node.input) - subgraph_inputs = subgraph.input[: len(node.input)] - for i, si in enumerate(subgraph_inputs): - subgraph_name = si.name - si.CopyFrom(self.known_vi_[node.input[i]]) - if i >= num_scan_states: - scan_input_dim = si.type.tensor_type.shape.dim - scan_input_dim.remove( - scan_input_dim[scan_input_axes[i - num_scan_states]] - ) - si.name = subgraph_name - self._onnx_infer_subgraph(node, subgraph) - num_scan_outputs = len(node.output) - num_scan_states - scan_output_axes = get_attribute( - node, "scan_output_axes", [0] * num_scan_outputs - ) - scan_input_dim = get_shape_from_type_proto(self.known_vi_[node.input[-1]].type)[ - scan_input_axes[-1] - ] - for i, o in enumerate(node.output): - vi = self.known_vi_[o] - if i >= num_scan_states: - shape = get_shape_from_type_proto(subgraph.output[i].type) - new_dim = handle_negative_axis( - scan_output_axes[i - num_scan_states], len(shape) + 1 - ) - shape = shape[:new_dim] + [scan_input_dim] + shape[new_dim:] - vi.CopyFrom( - helper.make_tensor_value_info( - o, subgraph.output[i].type.tensor_type.elem_type, shape - ) - ) - else: - vi.CopyFrom(subgraph.output[i]) - vi.name = o - - def _infer_ScatterElements(self, node): - data_shape = self._get_shape(node, 0) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - data_shape, - ) - ) - - def _infer_SequenceAt(self, node): - # need to create new symbolic dimension if sequence shape has None: - seq_shape = self._get_shape(node, 0) - vi = self.known_vi_[node.output[0]] - if seq_shape is not None: - for di, d in enumerate(seq_shape): - if d is not None: - continue - new_dim = onnx.TensorShapeProto.Dimension() - new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, 0, di)) - vi.type.tensor_type.shape.dim[di].CopyFrom(new_dim) - - def _infer_SequenceInsert(self, node): - # workaround bug in onnx's shape inference - vi_seq = self.known_vi_[node.input[0]] - vi_tensor = self.known_vi_[node.input[1]] - vi_out_seq = self.known_vi_[node.output[0]] - vi_out_seq.CopyFrom(vi_seq) - vi_out_seq.name = node.output[0] - self._fuse_tensor_type(node, 0, vi_out_seq.type, 
vi_tensor.type) - - def _infer_Shape(self, node): - self.sympy_data_[node.output[0]] = self._get_sympy_shape(node, 0) - - def _infer_Size(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - self.sympy_data_[node.output[0]] = sympy_reduce_product(sympy_shape) - self.known_vi_[node.output[0]].CopyFrom( - helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, []) - ) - - def _infer_Slice(self, node): - def less_equal(x, y): - try: - return bool(x <= y) - except TypeError: - pass - try: - return bool(y >= x) - except TypeError: - pass - try: - return bool(-x >= -y) - except TypeError: - pass - try: - return bool(-y <= -x) - except TypeError: - # the last attempt; this may raise TypeError - return bool(y - x >= 0) - - def handle_negative_index(index, bound): - """normalizes a negative index to be in [0, bound)""" - try: - if not less_equal(0, index): - if is_literal(index) and index <= -self.int_max_: - # this case is handled separately - return index - return bound + index - except TypeError: - logger.warning("Cannot determine if {} < 0".format(index)) - return index - - if get_opset(self.out_mp_) <= 9: - axes = get_attribute(node, "axes") - starts = get_attribute(node, "starts") - ends = get_attribute(node, "ends") - if not axes: - axes = list(range(len(starts))) - steps = [1] * len(axes) - else: - starts = as_list(self._try_get_value(node, 1), keep_none=True) - ends = as_list(self._try_get_value(node, 2), keep_none=True) - axes = self._try_get_value(node, 3) - steps = self._try_get_value(node, 4) - if axes is None and not (starts is None and ends is None): - axes = list(range(0, len(starts if starts is not None else ends))) - if steps is None and not (starts is None and ends is None): - steps = [1] * len(starts if starts is not None else ends) - axes = as_list(axes, keep_none=True) - steps = as_list(steps, keep_none=True) - - new_sympy_shape = self._get_sympy_shape(node, 0) - if starts is None or ends is None: - if axes is None: - for i in range(len(new_sympy_shape)): - new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i) - else: - new_sympy_shape = get_shape_from_sympy_shape(new_sympy_shape) - for i in axes: - new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i) - else: - for i, s, e, t in zip(axes, starts, ends, steps): - e = handle_negative_index(e, new_sympy_shape[i]) - if is_literal(e): - if e >= self.int_max_: - e = new_sympy_shape[i] - elif e <= -self.int_max_: - e = 0 if s > 0 else -1 - elif is_literal(new_sympy_shape[i]): - if e < 0: - e = max(0, e + new_sympy_shape[i]) - e = min(e, new_sympy_shape[i]) - else: - if e > 0: - e = ( - sympy.Min(e, new_sympy_shape[i]) if e > 1 else e - ) # special case for slicing first to make computation easier - else: - if is_literal(new_sympy_shape[i]): - e = sympy.Min(e, new_sympy_shape[i]) - else: - try: - if not less_equal(e, new_sympy_shape[i]): - e = new_sympy_shape[i] - except Exception: - logger.warning( - "Unable to determine if {} <= {}, treat as equal".format( - e, new_sympy_shape[i] - ) - ) - e = new_sympy_shape[i] - - s = handle_negative_index(s, new_sympy_shape[i]) - if is_literal(new_sympy_shape[i]) and is_literal(s): - s = max(0, min(s, new_sympy_shape[i])) - - new_sympy_shape[i] = sympy.simplify( - (e - s + t + (-1 if t > 0 else 1)) // t - ) - - self._update_computed_dims(new_sympy_shape) - - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) 
- - # handle sympy_data if needed, for slice in shape computation - if ( - node.input[0] in self.sympy_data_ - and [0] == axes - and len(starts) == 1 - and len(ends) == 1 - and len(steps) == 1 - ): - input_sympy_data = self.sympy_data_[node.input[0]] - if type(input_sympy_data) == list or ( - type(input_sympy_data) == np.array and len(input_sympy_data.shape) == 1 - ): - self.sympy_data_[node.output[0]] = input_sympy_data[ - starts[0] : ends[0] : steps[0] - ] - - def _infer_SoftmaxCrossEntropyLoss(self, node): - vi = self.known_vi_[node.output[0]] - elem_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi.type.tensor_type.elem_type = elem_type - vi.type.tensor_type.shape.CopyFrom(onnx.TensorShapeProto()) - - if len(node.output) > 1: - data_shape = self._get_shape(node, 0) - vi = self.known_vi_[node.output[1]] - vi.CopyFrom(helper.make_tensor_value_info(vi.name, elem_type, data_shape)) - - def _infer_Split_Common(self, node, make_value_info_func): - input_sympy_shape = self._get_sympy_shape(node, 0) - axis = handle_negative_axis( - get_attribute(node, "axis", 0), len(input_sympy_shape) - ) - split = get_attribute(node, "split") - if not split: - num_outputs = len(node.output) - split = [input_sympy_shape[axis] / sympy.Integer(num_outputs)] * num_outputs - self._update_computed_dims(split) - else: - split = [sympy.Integer(s) for s in split] - - for i_o in range(len(split)): - vi = self.known_vi_[node.output[i_o]] - vi.CopyFrom( - make_value_info_func( - node.output[i_o], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape( - input_sympy_shape[:axis] - + [split[i_o]] - + input_sympy_shape[axis + 1 :] - ), - ) - ) - self.known_vi_[vi.name] = vi - - def _infer_Split(self, node): - self._infer_Split_Common(node, helper.make_tensor_value_info) - - def _infer_SplitToSequence(self, node): - self._infer_Split_Common(node, helper.make_sequence_value_info) - - def _infer_Squeeze(self, node): - input_shape = self._get_shape(node, 0) - op_set = get_opset(self.out_mp_) - - # Depending on op-version 'axes' are provided as attribute or via 2nd input - if op_set < 13: - axes = get_attribute(node, "axes") - assert self._try_get_value(node, 1) is None - else: - axes = self._try_get_value(node, 1) - assert get_attribute(node, "axes") is None - - if axes is None: - # No axes have been provided (neither via attribute nor via input). - # In this case the 'Shape' op should remove all axis with dimension 1. - # For symbolic dimensions we guess they are !=1. - output_shape = [s for s in input_shape if s != 1] - if self.verbose_ > 0: - symbolic_dimensions = [s for s in input_shape if type(s) != int] - if len(symbolic_dimensions) > 0: - logger.debug( - f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. " - + f"Assuming the following dimensions are never equal to 1: {symbolic_dimensions}" - ) - else: - axes = [handle_negative_axis(a, len(input_shape)) for a in axes] - output_shape = [] - for i in range(len(input_shape)): - if i not in axes: - output_shape.append(input_shape[i]) - else: - assert input_shape[i] == 1 or type(input_shape[i]) != int - if self.verbose_ > 0 and type(input_shape[i]) != int: - logger.debug( - f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. " - + f"Assuming the dimension '{input_shape[i]}' at index {i} of the input to be equal to 1." 
- ) - - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - self._pass_on_sympy_data(node) - - def _infer_Tile(self, node): - repeats_value = self._try_get_value(node, 1) - new_sympy_shape = [] - if repeats_value is not None: - input_sympy_shape = self._get_sympy_shape(node, 0) - for i, d in enumerate(input_sympy_shape): - new_dim = d * repeats_value[i] - new_sympy_shape.append(new_dim) - self._update_computed_dims(new_sympy_shape) - else: - new_sympy_shape = self._new_symbolic_shape( - self._get_shape_rank(node, 0), node - ) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - - def _infer_TopK(self, node): - rank = self._get_shape_rank(node, 0) - axis = handle_negative_axis(get_attribute(node, "axis", -1), rank) - new_shape = self._get_shape(node, 0) - - if get_opset(self.out_mp_) <= 9: - k = get_attribute(node, "k") - else: - k = self._get_int_values(node)[1] - - if k == None: - k = self._new_symbolic_dim_from_output(node) - else: - k = as_scalar(k) - - if type(k) in [int, str]: - new_shape[axis] = k - else: - new_sympy_shape = self._get_sympy_shape(node, 0) - new_sympy_shape[axis] = k - self._update_computed_dims( - new_sympy_shape - ) # note that TopK dim could be computed in sympy_data, so need to update computed_dims when it enters shape - new_shape = get_shape_from_sympy_shape(new_sympy_shape) - - for i_o in range(len(node.output)): - vi = self.known_vi_[node.output[i_o]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[i_o], vi.type.tensor_type.elem_type, new_shape - ) - ) - - def _infer_Transpose(self, node): - if node.input[0] in self.sympy_data_: - data_shape = self._get_shape(node, 0) - perm = get_attribute(node, "perm", reversed(list(range(len(data_shape))))) - input_data = self.sympy_data_[node.input[0]] - self.sympy_data_[node.output[0]] = ( - np.transpose( - np.array(input_data).reshape(*data_shape), axes=tuple(perm) - ) - .flatten() - .tolist() - ) - - def _infer_Unsqueeze(self, node): - input_shape = self._get_shape(node, 0) - op_set = get_opset(self.out_mp_) - - # Depending on op-version 'axes' are provided as attribute or via 2nd input - if op_set < 13: - axes = get_attribute(node, "axes") - assert self._try_get_value(node, 1) is None - else: - axes = self._try_get_value(node, 1) - assert get_attribute(node, "axes") is None - - output_rank = len(input_shape) + len(axes) - axes = [handle_negative_axis(a, output_rank) for a in axes] - - input_axis = 0 - output_shape = [] - for i in range(output_rank): - if i in axes: - output_shape.append(1) - else: - output_shape.append(input_shape[input_axis]) - input_axis += 1 - - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - - self._pass_on_sympy_data(node) - - def _infer_ZipMap(self, node): - map_key_type = None - if get_attribute(node, "classlabels_int64s") is not None: - map_key_type = onnx.TensorProto.INT64 - elif get_attribute(node, "classlabels_strings") is not None: - map_key_type = onnx.TensorProto.STRING - - assert map_key_type is not None - new_vi = onnx.ValueInfoProto() - new_vi.name = node.output[0] - new_vi.type.sequence_type.elem_type.map_type.value_type.tensor_type.elem_type = ( - 
onnx.TensorProto.FLOAT - ) - new_vi.type.sequence_type.elem_type.map_type.key_type = map_key_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom(new_vi) - - def _infer_Attention(self, node): - shape = self._get_shape(node, 0) - shape_bias = self._get_shape(node, 2) - assert len(shape) == 3 and len(shape_bias) == 1 - qkv_hidden_sizes_attr = get_attribute(node, "qkv_hidden_sizes") - if qkv_hidden_sizes_attr is not None: - assert len(qkv_hidden_sizes_attr) == 3 - shape[2] = int(qkv_hidden_sizes_attr[2]) - else: - shape[2] = int(shape_bias[0] / 3) - output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, shape)) - - if len(node.output) > 1: - # input shape: (batch_size, sequence_length, hidden_size) - # past shape: (2, batch_size, num_heads, past_sequence_length, head_size) - # mask shape: (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length) or (batch_size, 1, max_seq_len, max_seq_len) - # present shape: (2, batch_size, num_heads, total_sequence_length, head_size), where total_sequence_length=sequence_length+past_sequence_length - input_shape = self._get_shape(node, 0) - past_shape = self._get_shape(node, 4) - mask_shape = self._get_shape(node, 3) - if len(past_shape) == 5: - if len(mask_shape) in [2, 3]: - past_shape[3] = mask_shape[-1] - elif isinstance(input_shape[1], int) and isinstance(past_shape[3], int): - past_shape[3] = input_shape[1] + past_shape[3] - else: - past_shape[3] = f"{past_shape[3]}+{input_shape[1]}" - vi = self.known_vi_[node.output[1]] - vi.CopyFrom( - helper.make_tensor_value_info(vi.name, output_dtype, past_shape) - ) - - def _infer_BiasGelu(self, node): - self._propagate_shape_and_type(node) - - def _infer_FastGelu(self, node): - self._propagate_shape_and_type(node) - - def _infer_Gelu(self, node): - self._propagate_shape_and_type(node) - - def _infer_LayerNormalization(self, node): - self._propagate_shape_and_type(node) - - def _infer_LongformerAttention(self, node): - self._propagate_shape_and_type(node) - - def _infer_EmbedLayerNormalization(self, node): - input_ids_shape = self._get_shape(node, 0) - word_embedding_shape = self._get_shape(node, 2) - assert len(input_ids_shape) == 2 and len(word_embedding_shape) == 2 - output_shape = input_ids_shape + [word_embedding_shape[1]] - - word_embedding_dtype = self.known_vi_[node.input[2]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], word_embedding_dtype, output_shape - ) - ) - - mask_index_shape = [input_ids_shape[0]] - vi = self.known_vi_[node.output[1]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[1], onnx.TensorProto.INT32, mask_index_shape - ) - ) - - if len(node.output) > 2: - # Optional output of add before layer nomalization is done - # shape is same as the output - vi = self.known_vi_[node.output[2]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[2], word_embedding_dtype, output_shape - ) - ) - - def _infer_SkipLayerNormalization(self, node): - self._propagate_shape_and_type(node) - - def _infer_PythonOp(self, node): - output_tensor_types = get_attribute(node, "output_tensor_types") - assert output_tensor_types - output_tensor_ranks = get_attribute(node, "output_tensor_ranks") - assert output_tensor_ranks - - # set the context output seperately. - # The first output is autograd's context. 
- vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, []) - ) - - # Outputs after autograd's context are tensors. - # We assume their ranks are fixed for different model inputs. - for i in range(len(node.output) - 1): - # Process the i-th tensor outputs. - vi = self.known_vi_[node.output[i + 1]] - sympy_shape = self._new_symbolic_shape(output_tensor_ranks[i], node) - shape = get_shape_from_sympy_shape(sympy_shape) - value_info = helper.make_tensor_value_info( - node.output[i + 1], output_tensor_types[i], shape - ) - vi.CopyFrom(value_info) - - def _propagate_shape_and_type(self, node, input_index=0, output_index=0): - shape = self._get_shape(node, input_index) - output_dtype = self.known_vi_[ - node.input[input_index] - ].type.tensor_type.elem_type - vi = self.known_vi_[node.output[output_index]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[output_index], output_dtype, shape - ) - ) - - def _is_none_dim(self, dim_value): - if type(dim_value) != str: - return False - if "unk__" not in dim_value: - return False - if dim_value in self.symbolic_dims_.keys(): - return False - return True - - def _is_shape_contains_none_dim(self, out_shape): - for out in out_shape: - if self._is_none_dim(out): - return out - return None - - def _infer_impl(self, start_sympy_data=None): - self.sympy_data_ = start_sympy_data or {} - self.out_mp_.graph.ClearField("value_info") - self._apply_suggested_merge(graph_input_only=True) - self.input_symbols_ = set() - for i in self.out_mp_.graph.input: - input_shape = get_shape_from_value_info(i) - if input_shape is None: - continue - - if is_sequence(i.type): - input_dims = i.type.sequence_type.elem_type.tensor_type.shape.dim - else: - input_dims = i.type.tensor_type.shape.dim - - for i_dim, dim in enumerate(input_shape): - if dim is None: - # some models use None for symbolic dim in input, replace it with a string - input_dims[i_dim].dim_param = str( - self._new_symbolic_dim(i.name, i_dim) - ) - - self.input_symbols_.update([d for d in input_shape if type(d) == str]) - - for s in self.input_symbols_: - if s in self.suggested_merge_: - s_merge = self.suggested_merge_[s] - assert s_merge in self.symbolic_dims_ - self.symbolic_dims_[s] = self.symbolic_dims_[s_merge] - else: - # Since inputs are not produced by other ops, we can assume positivity - self.symbolic_dims_[s] = sympy.Symbol(s, integer=True, positive=True) - # create a temporary ModelProto for single node inference - # note that we remove initializer to have faster inference - # for tensor ops like Reshape/Tile/Expand that read initializer, we need to do sympy computation based inference anyways - self.tmp_mp_ = onnx.ModelProto() - self.tmp_mp_.CopyFrom(self.out_mp_) - self.tmp_mp_.graph.ClearField("initializer") - - # compute prerequesite for node for topological sort - # node with subgraphs may have dependency on implicit inputs, which will affect topological sort - prereq_for_node = ( - {} - ) # map from node to all its inputs, including implicit ones in subgraph - - def get_prereq(node): - names = set(i for i in node.input if i) - subgraphs = [] - if "If" == node.op_type: - subgraphs = [ - get_attribute(node, "then_branch"), - get_attribute(node, "else_branch"), - ] - elif node.op_type in ["Loop", "Scan"]: - subgraphs = [get_attribute(node, "body")] - for g in subgraphs: - g_outputs_and_initializers = {i.name for i in g.initializer} - g_prereq = set() - for n in g.node: - g_outputs_and_initializers.update(n.output) - for n 
in g.node: - g_prereq.update( - [ - i - for i in get_prereq(n) - if i not in g_outputs_and_initializers - ] - ) - names.update(g_prereq) - # remove subgraph inputs from g_prereq since those are local-only - for i in g.input: - if i.name in names: - names.remove(i.name) - return names - - for n in self.tmp_mp_.graph.node: - prereq_for_node[n.output[0]] = get_prereq(n) - - # topological sort nodes, note there might be dead nodes so we check if all graph outputs are reached to terminate - sorted_nodes = [] - sorted_known_vi = set( - [ - i.name - for i in list(self.out_mp_.graph.input) - + list(self.out_mp_.graph.initializer) - ] - ) - if any([o.name in sorted_known_vi for o in self.out_mp_.graph.output]): - # Loop/Scan will have some graph output in graph inputs, so don't do topological sort - sorted_nodes = self.out_mp_.graph.node - else: - while not all( - [o.name in sorted_known_vi for o in self.out_mp_.graph.output] - ): - old_sorted_nodes_len = len(sorted_nodes) - for node in self.out_mp_.graph.node: - if (node.output[0] not in sorted_known_vi) and all( - [ - i in sorted_known_vi - for i in prereq_for_node[node.output[0]] - if i - ] - ): - sorted_known_vi.update(node.output) - sorted_nodes.append(node) - if old_sorted_nodes_len == len(sorted_nodes) and not all( - [o.name in sorted_known_vi for o in self.out_mp_.graph.output] - ): - raise Exception("Invalid model with cyclic graph") - - for node in sorted_nodes: - assert all([i in self.known_vi_ for i in node.input if i]) - self._onnx_infer_single_node(node) - known_aten_op = False - if node.op_type in self.dispatcher_: - self.dispatcher_[node.op_type](node) - elif node.op_type in ["ConvTranspose"]: - # onnx shape inference ops like ConvTranspose may have empty shape for symbolic input - # before adding symbolic compute for them - # mark the output type as UNDEFINED to allow guessing of rank - vi = self.known_vi_[node.output[0]] - if len(vi.type.tensor_type.shape.dim) == 0: - vi.type.tensor_type.elem_type = onnx.TensorProto.UNDEFINED - elif node.op_type == "ATen" and node.domain == "org.pytorch.aten": - for attr in node.attribute: - # TODO: Is overload_name needed? - if attr.name == "operator": - aten_op_name = ( - attr.s.decode("utf-8") - if isinstance(attr.s, bytes) - else attr.s - ) - if aten_op_name in self.aten_op_dispatcher_: - known_aten_op = True - self.aten_op_dispatcher_[aten_op_name](node) - break - - if self.verbose_ > 2: - logger.debug(node.op_type + ": " + node.name) - for i, name in enumerate(node.input): - logger.debug( - " Input {}: {} {}".format( - i, name, "initializer" if name in self.initializers_ else "" - ) - ) - - # onnx automatically merge dims with value, i.e. 
Mul(['aaa', 'bbb'], [1000, 1]) -> [1000, 'bbb'] - # symbolic shape inference needs to apply merge of 'aaa' -> 1000 in this case - if node.op_type in [ - "Add", - "Sub", - "Mul", - "Div", - "MatMul", - "MatMulInteger", - "MatMulInteger16", - "Where", - "Sum", - ]: - vi = self.known_vi_[node.output[0]] - out_rank = len(get_shape_from_type_proto(vi.type)) - in_shapes = [self._get_shape(node, i) for i in range(len(node.input))] - for d in range( - out_rank - - ( - 2 - if node.op_type - in ["MatMul", "MatMulInteger", "MatMulInteger16"] - else 0 - ) - ): - in_dims = [ - s[len(s) - out_rank + d] - for s in in_shapes - if len(s) + d >= out_rank - ] - if len(in_dims) > 1: - self._check_merged_dims(in_dims, allow_broadcast=True) - - for i_o in range(len(node.output)): - vi = self.known_vi_[node.output[i_o]] - out_type = vi.type - out_type_kind = out_type.WhichOneof("value") - - # do not process shape for non-tensors - if out_type_kind not in ["tensor_type", "sparse_tensor_type", None]: - if self.verbose_ > 2: - if out_type_kind == "sequence_type": - seq_cls_type = out_type.sequence_type.elem_type.WhichOneof( - "value" - ) - if "tensor_type" == seq_cls_type: - logger.debug( - " {}: sequence of {} {}".format( - node.output[i_o], - str(get_shape_from_value_info(vi)), - onnx.TensorProto.DataType.Name( - vi.type.sequence_type.elem_type.tensor_type.elem_type - ), - ) - ) - else: - logger.debug( - " {}: sequence of {}".format( - node.output[i_o], seq_cls_type - ) - ) - else: - logger.debug( - " {}: {}".format(node.output[i_o], out_type_kind) - ) - continue - - out_shape = get_shape_from_value_info(vi) - out_type_undefined = ( - out_type.tensor_type.elem_type == onnx.TensorProto.UNDEFINED - ) - if self.verbose_ > 2: - logger.debug( - " {}: {} {}".format( - node.output[i_o], - str(out_shape), - onnx.TensorProto.DataType.Name( - vi.type.tensor_type.elem_type - ), - ) - ) - if node.output[i_o] in self.sympy_data_: - logger.debug( - " Sympy Data: " + str(self.sympy_data_[node.output[i_o]]) - ) - - # onnx >= 1.11.0, use unk__#index instead of None when the shape dim is uncertain - if ( - out_shape is not None - and ( - None in out_shape or self._is_shape_contains_none_dim(out_shape) - ) - ) or out_type_undefined: - if self.auto_merge_: - if node.op_type in [ - "Add", - "Sub", - "Mul", - "Div", - "MatMul", - "MatMulInteger", - "MatMulInteger16", - "Concat", - "Where", - "Sum", - "Equal", - "Less", - "Greater", - "LessOrEqual", - "GreaterOrEqual", - "Min", - "Max", - ]: - shapes = [ - self._get_shape(node, i) for i in range(len(node.input)) - ] - if node.op_type in [ - "MatMul", - "MatMulInteger", - "MatMulInteger16", - ]: - if ( - None in out_shape - or self._is_shape_contains_none_dim(out_shape) - ): - if None in out_shape: - idx = out_shape.index(None) - else: - idx = out_shape.index( - self._is_shape_contains_none_dim(out_shape) - ) - dim_idx = [ - len(s) - len(out_shape) + idx for s in shapes - ] - # only support auto merge for MatMul for dim < rank-2 when rank > 2 - assert ( - len(shapes[0]) > 2 - and dim_idx[0] < len(shapes[0]) - 2 - ) - assert ( - len(shapes[1]) > 2 - and dim_idx[1] < len(shapes[1]) - 2 - ) - elif node.op_type == "Expand": - # auto merge for cases like Expand([min(batch, 1), min(seq, 512)], [batch, seq]) - shapes = [ - self._get_shape(node, 0), - self._get_value(node, 1), - ] - else: - shapes = [] - - if shapes: - for idx in range(len(out_shape)): - if out_shape[idx] is not None and not self._is_none_dim( - out_shape[idx] - ): - continue - # note that the broadcasting rule aligns from 
right to left - # if a tensor has a lower rank (dim_idx[idx] < 0), it would automatically broadcast and need no merge - dim_idx = [ - len(s) - len(out_shape) + idx for s in shapes - ] - if len(dim_idx) > 0: - self._add_suggested_merge( - [ - s[i] if is_literal(s[i]) else str(s[i]) - for s, i in zip(shapes, dim_idx) - if i >= 0 - ] - ) - self.run_ = True - else: - self.run_ = False - else: - self.run_ = False - - # create new dynamic dims for ops not handled by symbolic shape inference - if ( - self.run_ == False - and not node.op_type in self.dispatcher_ - and not known_aten_op - ): - is_unknown_op = out_type_undefined and ( - out_shape is None or len(out_shape) == 0 - ) - if is_unknown_op: - # unknown op to ONNX, maybe from higher opset or other domain - # only guess the output rank from input 0 when using guess_output_rank option - out_rank = ( - self._get_shape_rank(node, 0) - if self.guess_output_rank_ - else -1 - ) - else: - # valid ONNX op, but not handled by symbolic shape inference, just assign dynamic shape - out_rank = len(out_shape) - - if out_rank >= 0: - new_shape = self._new_symbolic_shape(out_rank, node, i_o) - if out_type_undefined: - # guess output data type from input vi if not defined - out_dtype = self.known_vi_[ - node.input[0] - ].type.tensor_type.elem_type - else: - # otherwise, use original data type - out_dtype = vi.type.tensor_type.elem_type - vi.CopyFrom( - helper.make_tensor_value_info( - vi.name, - out_dtype, - get_shape_from_sympy_shape(new_shape), - ) - ) - - if self.verbose_ > 0: - if is_unknown_op: - logger.debug( - "Possible unknown op: {} node: {}, guessing {} shape".format( - node.op_type, node.name, vi.name - ) - ) - if self.verbose_ > 2: - logger.debug( - " {}: {} {}".format( - node.output[i_o], - str(new_shape), - vi.type.tensor_type.elem_type, - ) - ) - - self.run_ = True - continue # continue the inference after guess, no need to stop as no merge is needed - - if self.verbose_ > 0 or not self.auto_merge_ or out_type_undefined: - logger.debug( - "Stopping at incomplete shape inference at " - + node.op_type - + ": " - + node.name - ) - logger.debug("node inputs:") - for i in node.input: - logger.debug(self.known_vi_[i]) - logger.debug("node outputs:") - for o in node.output: - logger.debug(self.known_vi_[o]) - if self.auto_merge_ and not out_type_undefined: - logger.debug("Merging: " + str(self.suggested_merge_)) - return False - - self.run_ = False - return True - - def _update_output_from_vi(self): - for output in self.out_mp_.graph.output: - if output.name in self.known_vi_: - output.CopyFrom(self.known_vi_[output.name]) - - @staticmethod - def infer_shapes( - in_mp, int_max=2**31 - 1, auto_merge=False, guess_output_rank=False, verbose=0 - ): - onnx_opset = get_opset(in_mp) - if (not onnx_opset) or onnx_opset < 7: - logger.warning("Only support models of onnx opset 7 and above.") - return None - symbolic_shape_inference = SymbolicShapeInference( - int_max, auto_merge, guess_output_rank, verbose - ) - all_shapes_inferred = False - symbolic_shape_inference._preprocess(in_mp) - while symbolic_shape_inference.run_: - all_shapes_inferred = symbolic_shape_inference._infer_impl() - symbolic_shape_inference._update_output_from_vi() - if not all_shapes_inferred: - logger.warning("Incomplete symbolic shape inference") - return symbolic_shape_inference.out_mp_ - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--input", required=True, help="The input model file") - parser.add_argument("--output", help="The output model 
file") - parser.add_argument( - "--auto_merge", - help="Automatically merge symbolic dims when confliction happens", - action="store_true", - default=False, - ) - parser.add_argument( - "--int_max", - help="maximum value for integer to be treated as boundless for ops like slice", - type=int, - default=2**31 - 1, - ) - parser.add_argument( - "--guess_output_rank", - help="guess output rank to be the same as input 0 for unknown ops", - action="store_true", - default=False, - ) - parser.add_argument( - "--verbose", - help="Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed", - type=int, - default=0, - ) - parser.add_argument( - "--save_as_external_data", - help="Saving an ONNX model to external data", - action="store_true", - default=False, - ) - parser.add_argument( - "--all_tensors_to_one_file", - help="Saving all the external data to one file", - action="store_true", - default=False, - ) - parser.add_argument( - "--external_data_location", - help="The file location to save the external file", - default="./", - ) - parser.add_argument( - "--external_data_size_threshold", - help="The size threshold for external data", - type=int, - default=1024, - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_arguments() - logger.info("input model: " + args.input) - if args.output: - logger.info("output model " + args.output) - logger.info("Doing symbolic shape inference...") - out_mp = SymbolicShapeInference.infer_shapes( - onnx.load(args.input), - args.int_max, - args.auto_merge, - args.guess_output_rank, - args.verbose, - ) - if args.output and out_mp: - if args.save_as_external_data: - onnx.save_model( - out_mp, - args.output, - save_as_external_data=True, - all_tensors_to_one_file=args.all_tensors_to_one_file, - location=args.external_data_location, - size_threshold=args.external_data_size_threshold, - convert_attribute=False, - ) - else: - onnx.save(out_mp, args.output) - logger.info("Done!") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt deleted file mode 100644 index b80f9f4022328703df32af16182ea930645a6db6..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -onnxsim -packaging -sympy