diff --git a/models/cv/classification/swin_transformer_large/ixrt/README.md b/models/cv/classification/swin_transformer_large/ixrt/README.md
index f09a5c125c0dc5a021aa2d2d1fd25961d08bf02b..032c961d88878d66c4c830cac3b3273545d80295 100644
--- a/models/cv/classification/swin_transformer_large/ixrt/README.md
+++ b/models/cv/classification/swin_transformer_large/ixrt/README.md
@@ -10,8 +10,6 @@ Swin Transformer-Large is a variant of the Swin Transformer, an architecture des
| :----: | :----: | :----: |
| MR-V100 | 4.2.0 | 25.03 |
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
## Model Preparation
### Prepare Resources
@@ -53,6 +51,7 @@ python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/swin-large/s
```bash
git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
export ORIGIN_ONNX_NAME=./swin-large-torch-fp32
export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh b/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh
index b7fe2e695819ea348d0045c5774cc5e7af8037f2..02ac2c462036a5839f09d7448278098897d21679 100644
--- a/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh
+++ b/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh
@@ -26,6 +26,8 @@ else
fi
apt install -y libnuma-dev
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
pip install -r requirements.txt
mkdir -p general_perf/model_zoo/regular
diff --git a/models/nlp/plm/albert/ixrt/README.md b/models/nlp/plm/albert/ixrt/README.md
index 5944c1d15e499710580328e7b981568e83916586..778719bddff35be6d4fc5136b18e6efcc3d96da5 100644
--- a/models/nlp/plm/albert/ixrt/README.md
+++ b/models/nlp/plm/albert/ixrt/README.md
@@ -10,8 +10,6 @@ Albert (A Lite BERT) is a variant of the BERT (Bidirectional Encoder Representat
| :----: | :----: | :----: |
| MR-V100 | 4.2.0 | 25.03 |
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
## Model Preparation
### Prepare Resources
@@ -51,6 +49,7 @@ onnxsim albert-torch-fp32.onnx albert-torch-fp32-sim.onnx
```bash
git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
export ORIGIN_ONNX_NAME=./albert-torch-fp32-sim
export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/nlp/plm/albert/ixrt/ci/prepare.sh b/models/nlp/plm/albert/ixrt/ci/prepare.sh
index 68e8aa19da2132447fdfe6ea48f42bc026f48d7c..9e0dc3b925183fc0ca18848d3dd31cdec4bdf2f1 100644
--- a/models/nlp/plm/albert/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/albert/ixrt/ci/prepare.sh
@@ -21,6 +21,8 @@ apt install -y libnuma-dev
pip3 install -r requirements.txt
cp /root/data/3rd_party/albert-torch-fp32.json ./
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
python3 torch2onnx.py --model_path /root/data/checkpoints/open_albert/albert-base-squad.pt --output_path albert-torch-fp32.onnx
onnxsim albert-torch-fp32.onnx albert-torch-fp32-sim.onnx
diff --git a/models/nlp/plm/deberta/ixrt/README.md b/models/nlp/plm/deberta/ixrt/README.md
index b683a4cbde82ad5c7fa2d7964824ba2f6489afee..87496848406c894ec31d9886f4bbc6c6123980c1 100644
--- a/models/nlp/plm/deberta/ixrt/README.md
+++ b/models/nlp/plm/deberta/ixrt/README.md
@@ -15,8 +15,6 @@ fine-tuning to better suit specific downstream tasks, thereby improving the mode
| :----: | :----: | :----: |
| MR-V100 | 4.2.0 | 25.03 |
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
## Model Preparation
### Prepare Resources
@@ -55,6 +53,7 @@ python3 remove_clip_and_cast.py
```bash
git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
export ORIGIN_ONNX_NAME=./deberta-sim-drop-clip-drop-invaild-cast
export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/nlp/plm/deberta/ixrt/ci/prepare.sh b/models/nlp/plm/deberta/ixrt/ci/prepare.sh
index d440393e7ed913ae6a92fc0ab043a5744086f8c1..23ecd2b5bc02b6076db66490f28ab18efe07b86f 100644
--- a/models/nlp/plm/deberta/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/deberta/ixrt/ci/prepare.sh
@@ -21,6 +21,8 @@ apt install -y libnuma-dev
pip install -r requirements.txt
cp /root/data/3rd_party/deberta-torch-fp32.json ./
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
python3 torch2onnx.py --model_path /root/data/checkpoints/open_deberta/deberta-base-squad.pt --output_path deberta-torch-fp32.onnx
onnxsim deberta-torch-fp32.onnx deberta-torch-fp32-sim.onnx
diff --git a/models/nlp/plm/roberta/ixrt/README.md b/models/nlp/plm/roberta/ixrt/README.md
index acd1b45869ad0103681a7e65488071e52494576f..92cc8e4eb8dfbb8e3490eab6aabaf38134c731b9 100644
--- a/models/nlp/plm/roberta/ixrt/README.md
+++ b/models/nlp/plm/roberta/ixrt/README.md
@@ -17,8 +17,6 @@ our models and code.
| :----: | :----: | :----: |
| MR-V100 | 4.2.0 | 25.03 |
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
## Model Preparation
### Prepare Resources
@@ -62,6 +60,7 @@ onnxsim open_roberta/roberta-torch-fp32.onnx open_roberta/roberta-torch-fp32_sim
```bash
git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
export ORIGIN_ONNX_NAME=./open_roberta/roberta-torch-fp32_sim
export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/nlp/plm/roberta/ixrt/ci/prepare.sh b/models/nlp/plm/roberta/ixrt/ci/prepare.sh
index 81d02ab0621e5c06580fe8469fc9c2012ca3c3ee..5f00f9e9ac7096d7d17d9c1a50cd416c6db432de 100644
--- a/models/nlp/plm/roberta/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/roberta/ixrt/ci/prepare.sh
@@ -19,6 +19,8 @@ set -x
apt install -y libnuma-dev
pip install -r requirements.txt
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
mkdir -p data
cp -r /root/data/checkpoints/open_roberta data/
diff --git a/models/nlp/plm/roformer/ixrt/README.md b/models/nlp/plm/roformer/ixrt/README.md
index 890158fa2f42669484032186d1333b1187a9a860..5d37b5e6eb6ac8d7c0ce107ee5b248e64ba96a11 100644
--- a/models/nlp/plm/roformer/ixrt/README.md
+++ b/models/nlp/plm/roformer/ixrt/README.md
@@ -19,8 +19,6 @@ datasets.
| :----: | :----: | :----: |
| MR-V100 | 4.2.0 | 25.03 |
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
## Model Preparation
### Prepare Resources
@@ -68,6 +66,7 @@ python3 deploy.py --model_path ./data/open_roformer/roformer-frozen.onnx --outpu
```bash
git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
export ORIGIN_ONNX_NAME=./data/open_roformer/roformer-frozen
export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/nlp/plm/roformer/ixrt/ci/prepare.sh b/models/nlp/plm/roformer/ixrt/ci/prepare.sh
index ea80462db022331cb8b9c20f12a15e9ef8b0bdd6..deda09efeb451ceafa37daf0b0f519e209e9249f 100644
--- a/models/nlp/plm/roformer/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/roformer/ixrt/ci/prepare.sh
@@ -19,6 +19,8 @@ set -x
apt install -y libnuma-dev
pip install -r requirements.txt
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
mkdir -p data
cp -r /root/data/checkpoints/open_roformer data/
diff --git a/models/nlp/plm/roformer/ixrt/export_onnx.py b/models/nlp/plm/roformer/ixrt/export_onnx.py
index 475dddd7c2ab27b6ca342be98ea92d2c791ff60b..a0213bb449c7d632fdda2b43279037d6883f3424 100644
--- a/models/nlp/plm/roformer/ixrt/export_onnx.py
+++ b/models/nlp/plm/roformer/ixrt/export_onnx.py
@@ -16,7 +16,7 @@
import tf2onnx
from tf2onnx import tf_loader
import argparse
-ONNX_OPSET = 11
+ONNX_OPSET = 13
def _convert_graphdef_to_onnx(graph_def,
inputs=None,
diff --git a/models/nlp/plm/videobert/ixrt/README.md b/models/nlp/plm/videobert/ixrt/README.md
index 2f47a69bf90d4bc9d3e04fc17a457e260a6530c4..ded0114471da00dded55f6910c833998411cba4c 100644
--- a/models/nlp/plm/videobert/ixrt/README.md
+++ b/models/nlp/plm/videobert/ixrt/README.md
@@ -12,8 +12,6 @@ and textual information into a unified framework.
| :----: | :----: | :----: |
| MR-V100 | 4.2.0 | 25.03 |
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
## Model Preparation
### Prepare Resources
@@ -43,6 +41,7 @@ pip3 install -r requirements.txt
```bash
git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
export ORIGIN_ONNX_NAME=./general_perf/model_zoo/popular/open_videobert/video-bert
export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/nlp/plm/videobert/ixrt/ci/prepare.sh b/models/nlp/plm/videobert/ixrt/ci/prepare.sh
index 0d46c6c023fc58658a230714d3a1b06cc9430c2b..7d5f8fa49779ce6d6b52d088cca2ad0ce4a9dd5a 100644
--- a/models/nlp/plm/videobert/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/videobert/ixrt/ci/prepare.sh
@@ -19,6 +19,8 @@ set -x
apt install -y libnuma-dev
pip install -r requirements.txt
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
mkdir -p data
cp -r /root/data/checkpoints/open_videobert data/
diff --git a/tests/model_info.json b/tests/model_info.json
index e230305fff2fbed1fb15d7f585ed35b9ac71d43d..58adf01b0326d3a63a27d3bf3b1d31ebde2b15d3 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -3003,8 +3003,8 @@
"release_version": "25.03",
"release_sdk": "CoreX 4.2.0",
"release_gpgpu": "MR-V100",
- "latest_sdk": "4.2.0",
- "latest_gpgpu": "MR-V100",
+ "latest_sdk": "",
+ "latest_gpgpu": "",
"category": "cv/classification",
"toolbox": "",
"mdims": "",
@@ -5827,8 +5827,8 @@
"release_version": "24.09",
"release_sdk": "4.1.2",
"release_gpgpu": "MR-V100",
- "latest_sdk": "4.2.0",
- "latest_gpgpu": "MR-V100",
+ "latest_sdk": "",
+ "latest_gpgpu": "",
"category": "nlp/plm",
"toolbox": "",
"mdims": "",
@@ -6025,8 +6025,8 @@
"release_version": "24.09",
"release_sdk": "4.1.2",
"release_gpgpu": "MR-V100",
- "latest_sdk": "4.2.0",
- "latest_gpgpu": "MR-V100",
+ "latest_sdk": "",
+ "latest_gpgpu": "",
"category": "nlp/plm",
"toolbox": "",
"mdims": "",
@@ -6058,8 +6058,8 @@
"release_version": "24.09",
"release_sdk": "4.1.2",
"release_gpgpu": "MR-V100",
- "latest_sdk": "4.2.0",
- "latest_gpgpu": "MR-V100",
+ "latest_sdk": "",
+ "latest_gpgpu": "",
"category": "nlp/plm",
"toolbox": "",
"mdims": "",
@@ -6091,8 +6091,8 @@
"release_version": "24.09",
"release_sdk": "4.1.2",
"release_gpgpu": "MR-V100",
- "latest_sdk": "4.2.0",
- "latest_gpgpu": "MR-V100",
+ "latest_sdk": "",
+ "latest_gpgpu": "",
"category": "nlp/plm",
"toolbox": "",
"mdims": "",
@@ -6124,8 +6124,8 @@
"release_version": "24.09",
"release_sdk": "4.1.2",
"release_gpgpu": "MR-V100",
- "latest_sdk": "4.2.0",
- "latest_gpgpu": "MR-V100",
+ "latest_sdk": "",
+ "latest_gpgpu": "",
"category": "nlp/plm",
"toolbox": "",
"mdims": "",
diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py
index eb25acab7388ad14c509fd48a0862ff0bbec7f32..df6f59e122c6e529d05b1ff7fc20f6ea46fd35e6 100644
--- a/tests/run_ixrt.py
+++ b/tests/run_ixrt.py
@@ -189,7 +189,7 @@ def run_clf_testcase(model):
script = f"""
cd ../{model['model_path']}
export ORIGIN_ONNX_NAME=./swin-large-torch-fp32
- export OPTIMIER_FILE=/root/data/3rd_party/iluvatar-corex-ixrt/tools/optimizer/optimizer.py
+ export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
export PROJ_PATH=./
bash scripts/infer_swinl_fp16_performance.sh
cd ./ByteMLPerf/byte_infer_perf/general_perf
@@ -450,7 +450,7 @@ def run_nlp_testcase(model):
set -x
cd ../{model['model_path']}
export ORIGIN_ONNX_NAME=./data/open_{model_name}/{model_name}
- export OPTIMIER_FILE=/root/data/3rd_party/iluvatar-corex-ixrt/tools/optimizer/optimizer.py
+ export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
export PROJ_PATH=./
bash scripts/infer_{model_name}_{prec}_performance.sh
cd ./ByteMLPerf/byte_infer_perf/general_perf
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/README.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/README.md
deleted file mode 100755
index 3d1318032a7b03971285a05b997d3275c0d3c3cf..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/README.md
+++ /dev/null
@@ -1,114 +0,0 @@
-# IxRT optimizer
-
-## 1. optimizer overview
-
-`optimizer` is a graph-fusion tool integrated in ixrt. It fuses operators in an ONNX graph into the corresponding IxRT plugins and is normally used together with IxRT.
-
-## 2. optimizer features
-
-| Feature | Description |
-| ---------- | ------------------------------------------------------------ |
-| Dynamic-graph support | Both dynamic and static graphs can be fused |
-| Model support | Verified so far on videobert, roberta, deberta, swinL, roformer, albert, yolov5s, visionTransformer and gpt2; the tool is not yet recommended for other models |
-
-## 3. optimizer arguments
-
-| Argument | Description |
-| ---------------- | ------------------------------------------------------------ |
-| `--onnx` | Required. Path of the ONNX model to process |
-| `--num_heads` | Optional. Number of attention heads in the model's attention modules |
-| `--hidden_size` | Optional. Hidden size of the model |
-| `--input_shapes` | Optional. Fix the input shapes of a dynamic model so inference runs with static shapes, e.g. --input_shapes "input_name1:3x224x224, input_name2:3x224x224" |
-| `--dump_onnx` | Optional. Dump intermediate ONNX graphs during fusion, producing ONNX models ending in _sim |
-| `--model_type` | Optional. Type of model to fuse; default "bert", choices ["bert", "swint", "roformer", "yolo", "gpt2", "vit"] |
-| `--log_level` | Optional. Log level of the IxRT runtime: debug, info, or error; default info |
-
-
-## 4. Usage examples
-
-### 4.1 Example 1: fuse albert|videobert|roberta|deberta
-
-```bash
-cd oss/tools/optimizer
-python3 optimizer.py --onnx ${MODEL_PATH}
-```
-
-### 4.2 Example 2: fuse swinL
-
-```bash
-cd oss/tools/optimizer
-python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint
-```
-
-### 4.3 Example 3: fuse roformer
-
-```bash
-cd oss/tools/optimizer
-python3 optimizer.py --onnx ${MODEL_PATH} --model_type roformer
-```
-
-### 4.4 Example 4: fuse yolov5s
-
-```bash
-cd oss/tools/optimizer
-python3 optimizer.py --onnx ${MODEL_PATH} --model_type yolo
-```
-
-### 4.5 Accuracy verification
-
-#### 4.5.1 Example 1: the albert model
-
-Example model variables:
-
-```
-MODEL_PATH="data/albert/albert-base-squad.onnx"
-MODEL_END_PATH="data/albert/albert-base-squad_end.onnx"
-MODEL_ENGINE_PATH="data/albert/albert-base-squad_end.engine"
-```
-
-Run the following commands:
-
-```bash
-cd oss/tools/optimizer
-python3 optimizer.py --onnx ${MODEL_PATH} --dump_onnx
-ixrtexec --onnx ${MODEL_END_PATH} --min_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
- --opt_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
- --max_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
- --save_engine ${MODEL_ENGINE_PATH} --log_level verbose --plugins ixrt_plugin
-ixrtexec --load_engine ${MODEL_ENGINE_PATH} --ort_onnx ${MODEL_PATH} --plugins ixrt_plugin --verify_acc
-```
-
-#### 4.5.2 Example 2: the swinL model
-
-Example model variables:
-
-```
-BS=1
-MODEL_PATH="data/swint/swin-transformer-large.onnx"
-MODEL_END_PATH="data/swint/swin-transformer-large_end.onnx"
-MODEL_ENGINE_PATH="data/swint/swin-transformer-large_end.engine"
-MODEL_SIM_STATIC_SIM_PATH="data/swint/swin-transformer-large_sim_static_sim.onnx"
-```
-
-Run the following commands:
-
-```bash
-cd oss/tools/optimizer
-# Fix the input shape to ${BS}x3x384x384
-python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint --dump_onnx
-
-# Build engine
-ixrtexec --onnx ${MODEL_END_PATH} --save_engine ${MODEL_ENGINE_PATH} --log_level verbose --plugins ixrt_plugin
-
-# Measure performance
-ixrtexec --load_engine ${MODEL_ENGINE_PATH} --plugins ixrt_plugin
-
-# Verify accuracy
-ixrtexec --load_engine ${MODEL_ENGINE_PATH} --ort_onnx ${MODEL_SIM_STATIC_SIM_PATH} --plugins ixrt_plugin --verify_acc
-```
-
-See the accuracy-comparison tool section in [Advanced Topics](5_advanced_topics.md) for detailed usage and how it works.
-
-You can also use the [C++ API guide](3_cpp_api.md) or the [Python API guide](4_python_api.md).
-
-For concrete usage, refer to the samples under oss/samples.
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py
deleted file mode 100644
index de522e5b082b122a28b0a0423a40909598aa82d5..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md
deleted file mode 100644
index 65175643c0e50d8445ef65deae088de4600244f0..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
-## CI Test tool for IxRT
-
-### 1. Install dltest tool
-
- python setup.py develop
-
-### 2. Usage
-
-#### 2.1 Fetch log
-
-Command:
-
-```shell
-ixdltest-fetch args_or_pipe ${log_path}
-```
-
-Arguments:
-
-- p or patterns, The pattern of fetch log;
-- pn or pattern_names, The name of pattern;
-- use_re, Whether use regular expression;
-- d or nearest_distance, default=10, The nearest distance of matched pattern;
-- start_flag, The flag of start to record log;
-- end_flag, The flag of stop to record log;
-- split_pattern, The pattern used to match a line; if the line matches, `split_sep` is used to split it.
-- split_sep, The separator used to split the line;
-- split_idx, The index of split line;
-- saved, Save result to path;
-- log, Log path.
-
-Example
-Analyse from file
-```
-$ ixdltest-fetch run.log -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100
-{'results': [{'Throughput': [188.5461778786721]}]}
-- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0
-```
-
-Analyse from command line pipe
-```
-$ cat run.log | ixdltest-fetch -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100
-{'results': [{'Throughput': [188.5461778786721]}]}
-- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0
-```
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py
deleted file mode 100644
index 5458f31666f11de72d52a4e834b8a87be9a992d0..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .utils.infer_args import show_infer_arguments
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py
deleted file mode 100644
index 182e895c7fe902a31fc982fab6f96e0c55125c4a..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py
+++ /dev/null
@@ -1,215 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import os
-from typing import List, Iterable, Optional
-
-from dltest.cli.log_parser_cli import LogParserCLI
-from dltest.log_parser import LogParser
-from dltest.model_compare_config import get_compare_config_with_full_path
-from dltest.utils.misc import get_full_path
-from dltest.utils.subprocess_tools import get_output
-from dltest.model_compare_config import ComparatorConfig
-
-
-FRAMEWORKS = list(ComparatorConfig.get_frameworks())
-
-REMAINDER = '...'
-
-assertion_expr_factory = dict(
- eq = "a == b",
- ne = "a != b",
- ge = "a >= b",
- le = "a <= b",
- gt = "a > b",
- lt = "a < b",
-)
-
-
-class AssertCLI(LogParserCLI):
-
- def command_name(self):
- return "assert"
-
- def predefine_args(self):
- super(AssertCLI, self).predefine_args()
- self.parser.add_argument('-b', '--assertion_second_value', type=float, default=None,
- help='It is used in assertion expression.')
- self.parser.add_argument('--print_result', action="store_true", default=False,
- help='Whether print result')
- self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'],
- help='The method of capture output')
- # FIXME: Using store_action to replace it
- self.parser.add_argument('--only_last', type=int, default=0,
- help='Whether use the last result to compare')
- self.parser.add_argument('--expr', type=str, default="ge",
- help=f"Assertion expression, option keys: {', '.join(assertion_expr_factory.keys())}" +
- ", or a executable code, such as `a > b`, `a > 1`, ...")
- self.parser.add_argument('--use_predefined_parser_rules', action="store_true", default=False,
- help='Whether use predefined args of parser.')
- self.parser.add_argument('--log', type=str, default=None, help="Log path")
- self.parser.add_argument("--run_script", default=[], nargs=REMAINDER)
-
- def parse_args(self, *args, **kwargs):
- args = super(AssertCLI, self).parse_args()
- args.only_last = args.only_last > 0
- if len(args.run_script) == 0 and args.log is None:
- raise ValueError("The one of `--run_script` or `--log` must be given.")
-
- if args.assertion_second_value is None:
- if args.expr is None:
- raise ValueError("The one of `--assertion_second_value` or `--expr` must be given.")
-
- if args.expr in assertion_expr_factory:
- raise ValueError(
- "The comparison operators depend on the argument `assertion_second_value`."
- )
-
- return args
-
- def create_parser(self, args):
- if args.use_predefined_parser_rules:
- script_path = self._get_script_path(args.run_script)
- config = get_compare_config_with_full_path(script_path, to_dict=False)
-
- return LogParser(
- patterns=config.patterns, pattern_names=config.pattern_names,
- use_re=config.use_re, nearest_distance=config.nearest_distance,
- start_line_pattern_flag=config.start_line_pattern_flag,
- end_line_pattern_flag=config.end_line_pattern_flag,
- split_pattern=config.split_pattern,
- split_sep=config.split_sep,
- split_idx=config.split_idx
- )
-
- return LogParser(
- patterns=args.patterns, pattern_names=args.pattern_names,
- use_re=args.use_re, nearest_distance=args.nearest_distance,
- start_line_pattern_flag=args.start_flag,
- end_line_pattern_flag=args.end_flag,
- split_pattern=args.split_pattern,
- split_sep=args.split_sep,
- split_idx=args.split_idx
- )
-
- def run(self):
- args = self.parse_args()
- parser = self.create_parser(args)
-
- if args.print_result:
- print(args)
-
- output = self.get_log(args)
- parsed_logs = self.parser_log(parser, output, args)
- self.check_logs(parsed_logs, args)
-
- def get_log(self, args):
- if len(args.run_script) == 0:
- try:
- with open(args.log) as f:
- return f.readlines()
- except:
- print(f"ERROR: Read log fail in {args.log}")
- exit(1)
- else:
- return get_output(args.run_script, capture_output_method=args.capture_output)
-
- def parser_log(self, parser, output, args) -> List[float]:
- results = parser.parse(output)
- if args.only_last:
- results = results[-1:]
-
- if len(results) == 0:
- raise ValueError("The parsed results is empty, please check patterns.")
- if isinstance(results[0], dict):
- if len(results[0]) == 0:
- raise ValueError("The parsed results is empty, please check patterns.")
- key = list(results[0].keys())[0]
- results = [result[key] for result in results]
-
- if isinstance(results[0], Iterable):
- results = [result[0] for result in results]
-
- return results
-
- def check_logs(self, parsed_logs, args):
- if args.print_result:
- print("Parsed result:", parsed_logs)
-
- assertion_expr = assertion_expr_factory.get(args.expr, args.expr)
-
- assert_results = []
- b = args.assertion_second_value
- for a in parsed_logs:
- assert_results.append(eval(assertion_expr))
-
- if args.print_result:
- print("The result of assertion expression:", assert_results)
-
- if any(assert_results):
- print("SUCCESS")
- exit(0)
- print("FAIL")
- exit(1)
-
- def _get_script_path(self, run_script: List[str]):
- # Find shell script by current run_script
- def _find_real_shell_script(cmd: List[str]):
- for i, field in enumerate(cmd):
- if field.endswith('.sh') and self._get_framework(field) in FRAMEWORKS:
- return field
-
- real_shell_script = _find_real_shell_script(run_script)
-
- # Find shell script by parent process
- if real_shell_script is None:
- ppid = os.getppid()
- import psutil
- pproc = psutil.Process(ppid)
- pproc_cmd = pproc.cmdline()
- real_shell_script = _find_real_shell_script(pproc_cmd)
-
- if real_shell_script is not None:
- real_shell_script = self._get_script_abs_path(real_shell_script)
- return real_shell_script
-
- raise RuntimeError("The script is not named correctly, " + \
- "please use a script name ending with the framework, " + \
- f"got `{' '.join(run_script)}`, " + \
- "e.g. train_resnet50_torch.sh")
-
- def _get_framework(self, shell_script: str) -> Optional[str]:
- try:
- return shell_script.split('.')[-2].split('_')[-1]
- except:
- return None
-
- def _get_script_abs_path(self, run_script):
- real_run_script = os.path.realpath(run_script)
- if os.path.exists(real_run_script):
- return real_run_script
-
- if "MODEL_DIR" in os.environ:
- return os.path.join(os.environ["MODEL_DIR"], run_script)
-
- if "OLDPWD" in os.environ:
- real_run_script = os.path.join(os.environ["OLDPWD"], run_script)
- if os.path.exists(real_run_script):
- return real_run_script
-
- raise FileNotFoundError("Not found running script path, " + \
- "please set environment variable `MODEL_DIR`, " + \
- "e.g /path/to/deeplearningsamples/executables/resnet.")
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py
deleted file mode 100644
index b40f3a72fb949c18104963fb598c58076c65b479..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import os
-
-from .assert_cli import AssertCLI
-from ..utils.subprocess_tools import execute_shell
-
-RUN_MODE_KEY = "RUN_MODE"
-RUN_MODE_STRICT = "strict"
-
-
-class CheckCli(AssertCLI):
-
- def __init__(self, *args, **kwargs):
- super(CheckCli, self).__init__(*args, **kwargs)
- self.args = None
-
- def command_name(self):
- return "check"
-
- def predefine_args(self):
- self.parser.add_argument("--check_mode", type=str, default="no",
- choices=["all", "strict", "nonstrict", "no"],
- help="which running mode needs to be checked")
- self.parser.add_argument("--nonstrict_mode_args", type=str, default="",
- help="the arguments are used with nonstric testing")
- super(CheckCli, self).predefine_args()
-
- def parse_args(self, *args, **kwargs):
- if self.args is None:
- args = super(CheckCli, self).parse_args(*args, **kwargs)
- args.use_predefined_parser_rules = True
- args.nonstrict_mode_args = args.nonstrict_mode_args.split(" ")
-
- if not self.is_strict_testing():
- args.run_script.extend(args.nonstrict_mode_args)
-
- if args.check_mode == "all":
- args.check_mode = self.current_running_mode()
-
- self.args = args
- return self.args
-
- def run(self):
- args = self.parse_args()
- if args.check_mode == self.current_running_mode():
- return super(CheckCli, self).run()
- else:
- res = execute_shell(args.run_script)
- exit(res.returncode)
-
- def current_running_mode(self):
- return os.environ.get(RUN_MODE_KEY, RUN_MODE_STRICT)
-
- def is_strict_testing(self):
- return self.current_running_mode() == RUN_MODE_STRICT
-
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py
deleted file mode 100644
index c631f332b6a46c43c7891e4925d011e49741dc5d..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-from dltest.cli.assert_cli import AssertCLI
-from dltest.cli.log_comparator_cli import LogComparatorCLI
-from dltest.cli.model_validator_cli import ModelValidatorCLI
-from dltest.cli.fetch_log_cli import FetchLog
-from dltest.cli.check_cli import CheckCli
-
-
-#log_comparator_cli = LogComparatorCLI()
-#model_validator_cli = ModelValidatorCLI()
-fetch_log_cli = FetchLog()
-#assert_cli = AssertCLI()
-#check_cli = CheckCli()
-
-
-def make_execute_path():
- preffix = "dltest.cli.entry_points"
- clis = []
- for cli_var in globals():
- if cli_var.endswith('_cli'):
- cmd_name = globals()[cli_var].command_name()
- clis.append(f"ixdltest-{cmd_name}={preffix}:{cli_var}")
-
- return clis
-
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py
deleted file mode 100644
index 41f3c3cac3151b61362b3ff57609df0f64896181..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import json
-import sys
-from typing import Mapping
-from os.path import basename, join, exists, expanduser, dirname
-
-from dltest.log_parser import LogParser
-from dltest.cli.log_parser_cli import LogParserCLI
-from dltest.utils.iluvatar import get_iluvatar_card_type, IluvatarGPU
-
-
-
-
-def parse_target(target):
- result = {}
- targets = target.split(",")
- for i in targets:
- item = i.split(":")
- assert len(item) == 2
- key, value = item
- result[key] = float(value)
- return result
-
-
-def load_json(file):
- file_path = expanduser(file)
- # Check whether the file exists
- if exists(file_path):
- # Load the json file
- with open(file_path, 'r') as file:
- data = json.load(file)
- else:
- # Otherwise start with an empty json object
- data = {}
-
- return data
-
-def process_results(results):
- result = dict()
- for i in results["results"]:
- for k, v in i.items():
- result[k] = v[0]
- return result
-
-class FetchLog(LogParserCLI):
-
- def command_name(self):
- return "fetch"
-
- def predefine_args(self):
- super(FetchLog, self).predefine_args()
- self.parser.add_argument('log', nargs='?', type=str, help="Log path")
- self.parser.add_argument('--saved', type=str, default=None, help='Save to path')
- self.parser.add_argument('--saved_entry', type=str, default=None, help='Save to path')
- self.parser.add_argument('-t_bi150','--target_bi150', type=str, default=-1.)
- self.parser.add_argument('-t_mr100','--target_mr100', type=str, default=-1.)
- self.parser.add_argument('-t_mr50','--target_mr50', type=str, default=-1.)
-
- def run(self):
- args = self.parse_args()
- parser = LogParser(
- patterns=args.patterns, pattern_names=args.pattern_names,
- use_re=args.use_re, nearest_distance=args.nearest_distance,
- start_line_pattern_flag=args.start_flag,
- end_line_pattern_flag=args.end_flag,
- split_pattern=args.split_pattern,
- split_sep=args.split_sep,
- split_idx=args.split_idx
- )
-
- results = parser.parse(args.log)
- if not isinstance(results, Mapping):
- results = dict(results=results)
- results = process_results(results)
- print(results)
-
- if args.saved is not None:
- saved = load_json(args.saved)
- if not args.saved_entry:
- raise Exception("You need to use --saved_entry to specify entry name of the result")
-
- saved[args.saved_entry] = results
- with open(args.saved, 'w') as f:
- json.dump(saved, f, indent=4)
- self.compare_results(args, results)
-
-
- def compare_results(self, args, results):
- card = get_iluvatar_card_type()
- if card == IluvatarGPU.UNKNOWN:
- print("Not known which card is used, can you use ixsmi in the environment?")
- return
- user_target = getattr(args, 'target_'+card.name.lower(), "")
- user_target = parse_target(user_target)
-
- is_expected = True
- for key, target in user_target.items():
- if key not in results:
- continue
- if results[key] < target:
- is_expected = False
- print(f"- Check {key} on {card.name} failed (result vs target): {results[key]}<{target}")
- else:
- print(f"- Check {key} on {card.name} passed (result vs target): {results[key]}>={target}")
- if not is_expected:
- sys.exit(1)
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py
deleted file mode 100644
index cac8a0a684440371ece5067086cd75eed939f482..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import json
-from pprint import pprint
-
-from dltest.cli.log_parser_cli import LogParserCLI
-from dltest.log_comparator import compare_logs_with_paths, DEFAULT_NEAREST_MATCH_CHARS
-
-
-class LogComparatorCLI(LogParserCLI):
-
- def command_name(self):
- return "compare"
-
- def predefine_args(self):
- super(LogComparatorCLI, self).predefine_args()
- self.parser.add_argument('--log1', type=str, help="First log")
- self.parser.add_argument('--log2', type=str, help="Second log")
- self.parser.add_argument('--threshold', type=float, default=0.0001, help="Threshold")
- self.parser.add_argument('--only_last', type=int, default=1, help='Whether use the last result to compare')
- self.parser.add_argument('--saved', type=str, default=None, help='Save to path')
- self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result')
- self.parser.add_argument('--allow_greater_than', action="store_true", default=False, help='Allow log1 greater than log2')
-
- def parse_args(self, *args, **kwargs):
- args = super(LogComparatorCLI, self).parse_args(*args, **kwargs)
- args.only_last = args.only_last >= 1
-
- return args
-
- def run(self):
- args = self.parse_args()
- satisfied, results = compare_logs_with_paths(
- log1=args.log1, log2=args.log2,
- threshold=args.threshold,
- patterns=args.patterns, pattern_names=args.pattern_names,
- use_re=args.use_re, nearest_distance=args.nearest_distance,
- start_line_pattern_flag=args.start_flag,
- end_line_pattern_flag=args.end_flag,
- only_last=args.only_last,
- split_pattern=args.split_pattern,
- split_sep=args.split_sep,
- split_idx=args.split_idx,
- allow_greater_than=True
- )
-
- if args.print_result:
- pprint(results)
-
- if satisfied:
- print("SUCCESS")
- else:
- print("FAIL")
-
- if args.saved is not None:
- with open(args.saved, 'w') as f:
- json.dump(results, f)
-
-
-
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py
deleted file mode 100644
index d2e2dd1be2d305a83a2969b5d4dbfbfeef2d9fd0..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import json
-from typing import Mapping
-
-from dltest.log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS
-from dltest.utils.base_cli import BaseCLI
-
-
-class LogParserCLI(BaseCLI):
-
- def predefine_args(self):
- self.parser.add_argument('-p', '--patterns', nargs="*", type=str, default=None, help='Fetched patterns')
- self.parser.add_argument('-pn', '--pattern_names', nargs="*", type=str, default=None, help='The name of pattern')
- self.parser.add_argument('--use_re', action="store_true", default=False, help='Whether use regular expression')
- self.parser.add_argument('-d', '--nearest_distance', type=int, default=DEFAULT_NEAREST_MATCH_CHARS, help='The nearest distance of matched pattern')
- self.parser.add_argument('--start_flag', type=str, default=None, help='The flag of start to record log')
- self.parser.add_argument('--end_flag', type=str, default=None, help='The flag of stop to record log')
- self.parser.add_argument('--split_pattern', type=str, default=None, help='The pattern is used to match line')
- self.parser.add_argument('--split_sep', nargs="*", type=str, default=None, help='The seperator is used to split line')
- self.parser.add_argument('--split_idx', nargs="*", type=int, default=None, help='The index of split line')
-
- def parse_args(self, *args, **kwargs):
- args = super(LogParserCLI, self).parse_args(*args, **kwargs)
-
- return args
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py
deleted file mode 100644
index 8d0d77d97d8f4f0d4d3528418c886884fa262575..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import json
-import os
-import os.path as ospath
-from pprint import pprint
-from typing import List, Union
-
-from dltest.utils.base_cli import BaseCLI
-from dltest.utils.get_env import get_gpu_type
-from dltest.utils.misc import get_full_path
-from dltest.model_compare_config import get_compare_config_with_full_path
-from dltest.log_comparator import compare_logs_with_paths
-from dltest.utils.subprocess_tools import get_output
-
-
-REMAINDER = '...'
-
-
-class ModelValidatorCLI(BaseCLI):
-
- def command_name(self):
- return "validate"
-
- def predefine_args(self):
- super(ModelValidatorCLI, self).predefine_args()
- self.parser.add_argument('-l', '--compare_log', type=str, default=None, help="Compare log")
- self.parser.add_argument('--saved', type=str, default=None, help='Save to path')
- self.parser.add_argument('--with_exit_code', type=int, default=1, help="Add exit code for the result of compared")
- self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result')
- self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], help='The method of capture output')
- self.parser.add_argument("run_script", nargs=REMAINDER)
-
- def parse_args(self, *args, **kwargs):
- args = super(ModelValidatorCLI, self).parse_args()
- if len(args.run_script) == 0:
- print("ERROR: Invalid run_script")
- exit(1)
-
- return args
-
- def run(self):
- args = self.parse_args()
- output = self._run_script(args.run_script, capture_output_method=args.capture_output)
- self.compare_logs(
- output, args.compare_log, args.run_script,
- args.saved, args.with_exit_code,
- args.print_result
- )
-
- def compare_logs(self, output: List, compare_log: str,
- run_script: List[str], saved: str=None,
- with_exit_code: int=1, print_result=False):
- script_path = self._get_script_path(run_script)
- script_path = get_full_path(script_path)
- compare_args = get_compare_config_with_full_path(script_path)
-
- if compare_log is None:
- epoch = self._get_epoch(run_script)
- script_name = ospath.basename(script_path)
- dist_tag = self._get_dist_tag(script_name)
- compare_log = self._find_comparable_log(script_path, epoch, dist_tag)
-
- if not ospath.exists(compare_log):
- print(f"ERROR: {compare_log} not exist. Or please use argument `l` to locate log.")
- exit(1)
-
- compare_args['log1'] = output
- compare_args['log2'] = compare_log
-
- satisfied, results = compare_logs_with_paths(**compare_args)
-
- if print_result:
- pprint(results)
-
- if satisfied:
- print("SUCCESS")
- else:
- print("FAIL")
-
- if saved is not None:
- with open(saved, 'w') as f:
- json.dump(results, f)
-
- if with_exit_code:
- if satisfied:
- exit(0)
- else:
- exit(1)
-
- def _run_script(self, command: List, capture_output_method: str='tempfile'):
- return get_output(command, capture_output_method=capture_output_method)
-
- def _get_script_path(self, run_script: List[str]):
- for i, field in enumerate(run_script):
- if field.endswith('.py') or field.endswith('.sh'):
- return field
-
- raise RuntimeError("Not found the name of script, " +
- "only support python or `sh` script, but got {}.".format(run_script))
-
- def _find_comparable_log(self, script_path: str, epoch: Union[str, int], dist_tag: str):
- gpu_type = get_gpu_type().lower()
-
- # Get the platform of trained log
- if gpu_type == "nv":
- gpu_type = 'bi'
- else:
- gpu_type = 'nv'
-
- script_path = get_full_path(script_path)
- project_dir = self._get_project_dir(script_path)
- script_name = ospath.basename(script_path)
-
- log_path = f"{project_dir}/runing_logs/{gpu_type}/{gpu_type}-{script_name}.epoch_{epoch}{dist_tag}.log"
-
- return log_path
-
-
- def _get_epoch(self, run_script: List[str]):
- for i, field in enumerate(run_script):
- if "--epoch" in field:
- if "=" in field:
- return field.split("=")[1]
- else:
- return run_script[i + 1]
-
- return 'default'
-
- def _get_dist_tag(self, script_name: str):
- try:
- import torch
- num_gpus = torch.cuda.device_count()
- except:
- num_gpus = os.environ.get("CUDA_VISIBLE_DEVICES", "all")
-
- if '_dist_' in script_name or '_multigpu_' in script_name:
- return f".{num_gpus}card"
- return ""
-
- def _get_project_dir(self, abs_path):
- abs_path = ospath.abspath(abs_path)
- script_dir = ospath.dirname(abs_path)
- executables_dir = ospath.dirname(script_dir)
- return ospath.dirname(executables_dir)
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py
deleted file mode 100644
index 9da2c0cd579a3407b6d743bfd2a4cdbbd28a687c..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-from typing import List, Mapping, Union, Tuple
-from .log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS
-
-LogLines = List[Mapping]
-CompareResult = Tuple[bool, Union[List, Mapping]]
-
-
-def _compute_errors(value1: Mapping, value2: Mapping, threshold: Mapping, allow_greater_than=False) -> CompareResult:
- if not isinstance(threshold, Mapping):
- _thds = dict()
- for key in value1.keys():
- _thds[key] = threshold
- threshold = _thds
-
- result = dict()
- satisfied = True
- for key, _thd in threshold.items():
- v1, v2 = value1[key], value2[key]
- origin_value_type = list
- if not isinstance(v1, (tuple, list)):
- origin_value_type = float
- v1 = [v1]
- v2 = [v2]
-
- real_errors = []
- for v1_i, v2_i in zip(v1, v2):
- real_error = v1_i - v2_i
- real_errors.append(real_error)
- if satisfied and abs(real_error) > _thd:
- if allow_greater_than and real_error > 0:
- continue
- satisfied = False
-
- if origin_value_type is float and len(real_errors) > 0:
- real_errors = real_errors[0]
-
- result[key] = real_errors
-
- return satisfied, result
-
-
-def compare_logs(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult:
- total_lines = len(log1[0])
- real_errors = []
- satisfied = True
- for line_idx in range(total_lines):
- _satisfied, _error = _compute_errors(log1[line_idx], log2[line_idx], threshold, allow_greater_than=allow_greater_than)
- real_errors.append(_error)
- if satisfied and not _satisfied:
- satisfied = False
-
- return satisfied, real_errors
-
-
-def compare_logs_by_last_result(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult:
- if len(log1) == 0 or len(log2) == 0:
- return False, []
- return _compute_errors(log1[-1], log2[-1], threshold, allow_greater_than=allow_greater_than)
-
-
-def compare_logs_with_paths(log1, log2, threshold: Union[float, Mapping],
- patterns: List[str],
- pattern_names: List[str] = None,
- use_re: bool = False,
- nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS,
- start_line_pattern_flag: str = None,
- end_line_pattern_flag: str = None,
- only_last: bool=True,
- split_pattern: Union[str, List] = None,
- split_sep: List = None,
- split_idx: List = None,
- allow_greater_than: bool = False):
- parser = LogParser(
- patterns=patterns, pattern_names=pattern_names,
- use_re=use_re, nearest_distance=nearest_distance,
- start_line_pattern_flag=start_line_pattern_flag,
- end_line_pattern_flag=end_line_pattern_flag,
- split_pattern=split_pattern,
- split_sep=split_sep,
- split_idx=split_idx
- )
-
- log1 = parser.parse(log1)
- log2 = parser.parse(log2)
-
- if only_last:
- compare_result = compare_logs_by_last_result(log1, log2, threshold, allow_greater_than=allow_greater_than)
- else:
- compare_result = compare_logs(log1, log2, threshold, allow_greater_than=allow_greater_than)
-
- return compare_result[0], dict(log1=log1, log2=log2, errors=compare_result[-1])
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py
deleted file mode 100644
index 3c690d8f677b3ae470322e29c266e84993a74266..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-from typing import List, Optional, Union, Mapping
-import re
-import sys
-
-
-DEFAULT_NEAREST_MATCH_CHARS = 10
-
-
-def read_file(file):
- with open(file, 'r') as f:
- return f.readlines()
-
-def read_pipe():
- result = []
- for line in sys.stdin:
- result.append(line)
- return result
-
-def postprocess_search_result(results: List[str]) -> List[float]:
- if len(results) != 0:
- results = list(map(float, results))
- return results
-
-
-def extract_nearest_value_by_key_inline(content: str, key: str,
- nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]:
- pattern = "%s[\s\S]{0,%d}?(\d+(?:\.\d+)?)" % (key, nearest_distance)
- return extract_value_by_pattern_inline(content, pattern)
-
-
-def extract_value_by_pattern_inline(content: str, pattern: str) -> List[float]:
- results = re.findall(pattern, content)
- return postprocess_search_result(results)
-
-
-def extract_value(content: str, pattern: str,
- inline=True, use_re=False,
- nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]:
- if inline:
- if use_re:
- return extract_value_by_pattern_inline(content, pattern)
- else:
- return extract_nearest_value_by_key_inline(content, pattern, nearest_distance)
- else:
- raise NotImplementedError()
-
-
-class LogParser:
-
- def __init__(self,
- patterns: List[str]=None,
- pattern_names: List[str]=None,
- use_re: bool=False,
- nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS,
- start_line_pattern_flag: str=None,
- end_line_pattern_flag: str=None,
- split_pattern: Union[str, List]=None,
- split_sep: List[str]=None,
- split_idx: List[int]=None):
- if patterns is None and split_sep is None:
- raise ValueError("The one of argument `patterns` or `split_sep` must be given.")
-
- if pattern_names is not None:
- if isinstance(patterns, (tuple, list)) and patterns is not None and len(patterns) != len(pattern_names):
- raise ValueError("The length of `pattern_names` argument not equal to `patterns`.")
- if isinstance(split_sep, (tuple, list)) and split_sep is not None and len(split_sep) != len(pattern_names):
- raise ValueError("The length of `pattern_names` argument not equal to `split_sep`.")
-
- if split_sep is not None and (split_idx is None or not isinstance(split_idx, (int, tuple, list))):
- raise ValueError("Invalid index to split text, got {}.".format(split_idx))
-
- if split_sep is not None and split_pattern is None:
- raise ValueError("Invalid pattern to split text, got {}.".format(split_pattern))
-
- self.patterns = patterns
- self.use_re = use_re
- self.nearest_distance = nearest_distance
- self.start_line_pattern_flag = start_line_pattern_flag
- self.end_line_pattern_flag = end_line_pattern_flag
-
- if not isinstance(split_sep, (tuple, list)) and split_sep is not None:
- split_sep = [split_sep]
-
- if not isinstance(split_idx, (tuple, list)):
- split_idx = [split_idx]
-
- self.split_sep = split_sep
- self.split_idx = split_idx
-
- if pattern_names is None:
- if patterns is None:
- pattern_names = split_idx
- else:
- pattern_names = patterns
- self.pattern_names = pattern_names
-
- if not isinstance(split_pattern, (tuple, list)) and split_sep is not None:
- split_pattern = [split_pattern] * len(split_sep)
- self.split_pattern = split_pattern
-
- self.start_record = start_line_pattern_flag is None
-
- def parse(self, path_or_logs: Union[str, List]) -> List[dict]:
- """
- : return: [{metric_name: value}, ...]
- """
-
-
- if path_or_logs:
- path_or_logs = read_file(path_or_logs)
- else:
- path_or_logs = read_pipe()
-
- ret = []
- for line in path_or_logs:
- result = self.parse_inline(line)
- if len(result) == 0:
- continue
- ret.append(result)
- return ret
-
- def parse_inline(self, line) -> dict:
- if not self.can_record(line):
- return {}
-
- if self.split_sep is None:
- return self._parse_inline_by_match(line)
- return self._parse_inline_by_split(line)
-
- def _parse_inline_by_match(self, line: str):
- ret = {}
- for name, pattern in zip(self.pattern_names, self.patterns):
- result = extract_value(
- line, pattern, inline=True, use_re=self.use_re,
- nearest_distance=self.nearest_distance
- )
- if len(result) == 0:
- continue
- ret[name] = result
- return ret
-
- def _parse_inline_by_split(self, line: str, to_type=float):
- ret = {}
- for name, sep, idx, pattern in zip(self.pattern_names,
- self.split_sep,
- self.split_idx,
- self.split_pattern):
- if not self.can_matched(line, pattern):
- continue
- if '\t' in sep:
- segs = line.strip().split(sep)
- else:
- segs = line.strip().replace('\t', ' ').split(sep)
- segs = list(filter(lambda kv: kv.strip() not in ["", " ", None], segs))
- if len(segs) <= idx:
- continue
- ret[name] = to_type(segs[idx])
- return ret
-
- def can_record(self, line: str):
- if self.start_line_pattern_flag is None:
- self.start_record = True
- elif not self.start_record:
- self.start_record = self.can_matched(line, self.start_line_pattern_flag)
-
- if self.start_record:
- if self.end_line_pattern_flag is not None and self.can_matched(line, self.end_line_pattern_flag):
- self.start_record = False
-
- return self.start_record
-
- def can_matched(self, content: str, pattern: str):
- result = re.findall(pattern, content)
- return len(result) != 0
-
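# A minimal standalone sketch (hypothetical, not taken from the deleted log_parser.py) of the
# key-based extraction idea behind extract_nearest_value_by_key_inline/LogParser: locate a metric
# key in a log line and take the nearest number that follows it. The helper name and sample line
# are made up for illustration.
import re

def extract_metric(line: str, key: str, nearest_distance: int = 20):
    pos = line.find(key)
    if pos < 0:
        return None
    # Only look at a short window after the key, mirroring the nearest_distance argument.
    window = line[pos + len(key):pos + len(key) + nearest_distance]
    match = re.search(r"[-+]?\d+(?:\.\d+)?", window)
    return float(match.group()) if match else None

line = "Epoch 90  * Acc@1 76.130 Acc@5 92.862"
print(extract_metric(line, "Acc@1"), extract_metric(line, "Acc@5"))  # 76.13 92.862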
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py
deleted file mode 100644
index ab7c60d3a6f0758bdac30b12fe82c83dab6cd520..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import os.path as ospath
-
-from typing import NamedTuple, Union, List, Mapping
-
-from dltest.log_parser import DEFAULT_NEAREST_MATCH_CHARS
-
-
-class LogComparatorArgs(NamedTuple):
- threshold: Union[float, Mapping]
- patterns: List[str] = None
- pattern_names: List[str] = None
- use_re: bool = False
- nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS
- start_line_pattern_flag: str = None
- end_line_pattern_flag: str = None
- split_pattern: Union[str, List] = None
- split_sep: List = None
- split_idx: List = None
- only_last: bool = True
- allow_greater_than: bool = True
-
- def to_dict(self):
- return self._asdict()
-
-
-class ArgsModelsTuple(NamedTuple):
-
- args: LogComparatorArgs
- models: List[str]
-
-
-class BaseConfig:
-
- def __getitem__(self, item):
- return self.__class__.__dict__[item]
-
- def __getattr__(self, item):
- return self.__class__.__dict__[item]
-
- def __iter__(self):
- for attr, value in self.__class__.__dict__.items():
- if isinstance(value, ArgsModelsTuple):
- yield attr
-
- def iter_items(self):
- for attr, value in self.__class__.__dict__.items():
- if isinstance(value, ArgsModelsTuple):
- yield attr, value
-
-
-class _TFComparatorConfig(BaseConfig):
-
- cnn_benchmarks = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- patterns=["Accuracy @ 1 =", "Accuracy @ 5 ="],
- pattern_names=["Acc@1", "Acc@5"]
- ),
- models=["alexnet", "inceptionv3", "resnet50", "resnet101", "vgg16"]
- )
-
- dist_cnn_benchmarks = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- split_sep=[' ', ' '],
- split_idx=[9, 10],
- split_pattern="[\s\S]*?images/sec:[\s\S]*?jitter",
- pattern_names=['Acc@1', 'Acc@5']
- ),
- models=[
- "alexnet_dist", "inceptionv3_dist", "resnet50_dist", "resnet101_dist", "vgg16_dist"
- ]
- )
-
- bert = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- patterns=["eval_accuracy ="],
- pattern_names=["Accuracy"]
- ),
- models=["bert"]
- )
-
- ssd = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- patterns=["acc="],
- pattern_names=["Acc@1"]
- ),
- models=["ssd"]
- )
-
- yolov3 = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.8,
- patterns=["mAP"]
- ),
- models=["yolov3"]
- )
-
- vnet = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- patterns=["background_dice", "anterior_dice", "posterior_dice"]
- ),
- models=["vnet"]
- )
-
-
-class _TorchComparatorConfig(BaseConfig):
- classification = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=8.0, patterns=['Acc@1', 'Acc@5'],
- start_line_pattern_flag="Start training",
- ),
- models=[
- 'googlenet', 'inceptionv3', 'mobilenetv3', 'resnet', 'shufflenetv2',
- 'vgg', 'resnet50_dali', 'resnext', 'densenet'
- ]
- )
-
- detection = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.03,
- patterns=[
- "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] ="
- ],
- pattern_names=["mAP"],
- start_line_pattern_flag="IoU metric: bbox",
- end_line_pattern_flag="IoU metric: segm"
- ),
- models=[
- 'maskrcnn', 'retinanet', 'ssd'
- ]
- )
-
- bert_cola = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- patterns=['mcc']
- ),
- models=['bert_cola']
- )
-
- bert_mrpc = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- patterns=['acc']
- ),
- models=['bert_mrpc']
- )
-
- bert_pretrain_apex = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- patterns=['eval_mlm_accaracy']
- ),
- models=['bert_pretrain_apex']
- )
-
- segmentation = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=8.0,
- patterns=['mean IoU:'],
- pattern_names=['mIoU']
- ),
- models=[
- 'deeplabv3', 'fcn'
- ]
- )
-
- t5 = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=5.0,
- split_pattern="eval_bleu[\s\S]*?=",
- split_sep=["="],
- split_idx=[1],
- pattern_names=['EvalBleu']
- ),
- models=['t5']
- )
-
- yolov3 = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- patterns=["mAP"]
- ),
- models=['yolov3']
- )
-
- yolov5 = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- patterns=[
- "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] ="
- ],
- pattern_names=["mAP"],
- ),
- models=['yolov5'],
- )
-
- yolov5s_coco128 = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*",
- split_sep=[" ", " "],
- split_idx=[5, 6],
- pattern_names=["AP50", "mAP"]
- ),
- models=['yolov5s_coco128']
- )
-
- centernet_resnet18 = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*",
- split_sep=[" ", " "],
- split_idx=[5, 6],
- pattern_names=["AP50", "mAP"]
- ),
- models=['centernet_resnet18']
- )
-
- fcos_resnet50_fpn = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.08,
- split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*",
- split_sep=[" ", " "],
- split_idx=[5, 6],
- pattern_names=["AP50", "mAP"]
- ),
- models=['fcos_resnet50_fpn']
- )
-
- ocr_recognition = ArgsModelsTuple(
- args=LogComparatorArgs(
- threshold=0.5, patterns=["0_word_acc"],
- ),
- models=[
- "sar", "satrn"
- ]
- )
-
-
-
-class ComparatorConfig:
-
- _configs = dict(tf=_TFComparatorConfig(), torch=_TorchComparatorConfig())
-
- @classmethod
- def get_frameworks(cls) -> List:
- return list(cls._configs.keys())
-
- @classmethod
- def get(cls, tf_or_torch, name, default=None):
- for model_kind, comb in cls._configs[tf_or_torch].iter_items():
- if name in comb.models:
- return comb.args
- if default is not None:
- return default
- raise KeyError("Not found config, but got {name} for {fw}".format(name=name, fw=tf_or_torch))
-
- @classmethod
- def find_config(cls, script_path: str) -> LogComparatorArgs:
- tf_or_torch = script_path.split('.')[-2].split('_')[-1]
-
- # Find by the name of script
- script_name = ospath.basename(script_path).rsplit('.', maxsplit=1)[0]
- if script_name.startswith('train_'):
- script_name = script_name.replace("train_", "", 1)
- while script_name not in [None, "", "/", "\\"]:
- try:
- config = cls.get(tf_or_torch, script_name)
- return config
- except KeyError:
- pass
- script_name = script_name.rsplit('_', maxsplit=1)
- if len(script_name) <= 1:
- break
- script_name = script_name[0]
-
- # Find by the name of model's dir
- model_dir_name = ospath.basename(ospath.dirname(script_path))
- try:
- config = cls.get(tf_or_torch, model_dir_name)
- return config
- except KeyError:
- raise RuntimeError("No comparator config found for {}".format(script_path))
-
-
-def get_compare_config_with_full_path(script_path: str, to_dict=True):
- config = ComparatorConfig.find_config(script_path)
- if to_dict:
- return config.to_dict()
- return config
-
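# A minimal standalone sketch (hypothetical, not taken from the deleted module) of the lookup
# strategy in ComparatorConfig.find_config: strip a leading "train_", then trim "_"-separated
# tokens from the right until a configured model name matches. REGISTRY stands in for the
# per-framework config classes above.
REGISTRY = {"torch": {"resnet50_dali": "classification", "yolov5": "detection"}}

def find_config(script_name: str, framework: str = "torch") -> str:
    name = script_name.replace("train_", "", 1) if script_name.startswith("train_") else script_name
    while name:
        if name in REGISTRY[framework]:
            return REGISTRY[framework][name]  # the real code returns a LogComparatorArgs
        parts = name.rsplit("_", maxsplit=1)
        if len(parts) <= 1:
            break
        name = parts[0]
    raise RuntimeError("No comparator config found for {}".format(script_name))

print(find_config("train_resnet50_dali_torch"))  # trims down to "resnet50_dali" -> "classification"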
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py
deleted file mode 100644
index 35f7efa99b21179da30ce34f412fa3319ea1ba00..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-from argparse import ArgumentParser
-from abc import abstractmethod
-
-
-class BaseCLI:
-
- def __init__(self, parser=None, *args, **kwargs):
- if parser is None:
- parser = ArgumentParser(description=self.description, *args, **kwargs)
- self.parser = parser
-
- def __call__(self):
- self.run()
-
- @property
- def description(self):
- return None
-
- @abstractmethod
- def command_name(self):
- pass
-
- def predefine_args(self):
- pass
-
- def parse_args(self, *args, **kwargs):
- self.predefine_args()
- return self.parser.parse_args(*args, **kwargs)
-
- @abstractmethod
- def run(self):
- pass
-
-
-
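# A sketch of the intended BaseCLI subclassing pattern (hypothetical: it assumes the dltest
# package removed in this diff is still importable, and the command and flag names are made up
# for illustration).
from dltest.utils.base_cli import BaseCLI

class CompareLogsCLI(BaseCLI):
    @property
    def description(self):
        return "Compare metrics extracted from two training logs."

    def command_name(self):
        return "dltest-compare-logs"

    def predefine_args(self):
        self.parser.add_argument("--log", required=True, help="Path to the training log.")

    def run(self):
        args = self.parse_args()
        print("comparing against", args.log)

if __name__ == "__main__":
    CompareLogsCLI()()  # __call__ dispatches to run()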
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py
deleted file mode 100644
index 97407f37bd9d8a4c5e0a68c760a561ec03a29f95..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import os
-from collections import defaultdict
-import os.path as osp
-import subprocess
-import sys
-
-
-def get_envinfo():
- import torch
- env_info = {}
- env_info['sys.platform'] = sys.platform
- env_info['Python'] = sys.version.replace('\n', '')
-
- cuda_available = torch.cuda.is_available()
- env_info['CUDA available'] = cuda_available
- if cuda_available:
- from torch.utils.cpp_extension import CUDA_HOME
- env_info['CUDA_HOME'] = CUDA_HOME
- if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
- try:
- nvcc = osp.join(CUDA_HOME, 'bin/nvcc')
- nvcc = subprocess.check_output(
- f'"{nvcc}" -V | tail -n1', shell=True)
- nvcc = nvcc.decode('utf-8').strip()
- except subprocess.SubprocessError:
- nvcc = 'Not Available'
- env_info['NVCC'] = nvcc
-
- devices = defaultdict(list)
- for k in range(torch.cuda.device_count()):
- devices[torch.cuda.get_device_name(k)].append(str(k))
- for name, devids in devices.items():
- env_info['GPU ' + ','.join(devids)] = name
-
- gcc = subprocess.check_output('gcc --version | head -n1', shell=True)
- gcc = gcc.decode('utf-8').strip()
- env_info['GCC'] = gcc
-
- env_info['PyTorch'] = torch.__version__
-
- return env_info
-
-
-def get_gpu_type():
- import torch
- if "DEBUG_GPU_TYPE" in os.environ:
- return os.environ["DEBUG_GPU_TYPE"]
-
- if not torch.cuda.is_available():
- return "BI"
- dev_name = torch.cuda.get_device_name(0)
- if 'IX BI' in dev_name or getattr(torch, "corex", False):
- _type = "BI"
- else:
- _type = "NV"
-
- return _type
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py
deleted file mode 100644
index 7328dd737c2720d544027ad1822d3c2007656a8e..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import sys
-import subprocess
-from enum import Enum
-
-__all__ = ["get_iluvatar_card_type", "IluvatarGPU"]
-
-class IluvatarGPU(Enum):
- UNKNOWN = -1
- MR50 = 0
- MR100 = 1
- BI150 = 2
-
-card_ixsmi_names = {
- "BI150": IluvatarGPU.BI150,
- "BI-V150": IluvatarGPU.BI150,
- "MR100": IluvatarGPU.MR100,
- "MR-V100": IluvatarGPU.MR100,
- "MR50": IluvatarGPU.MR50,
- "MR-V50": IluvatarGPU.MR50,
-}
-
-def get_iluvatar_card_type():
- command = 'ixsmi -L | grep "GPU \{1,\}0"'
- result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
- if result.returncode == 0:
- for key, value in card_ixsmi_names.items():
- if key in result.stdout:
- return value
- else:
- return IluvatarGPU.UNKNOWN
- else:
- return IluvatarGPU.UNKNOWN
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py
deleted file mode 100644
index 29760001cab2d9a8cbeecc894e9e3344ad00d2b4..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import os
-
-from typing import Union, List, Dict, Any, Mapping
-from argparse import Namespace, ArgumentParser
-import json
-
-
-def _obj_to_dict(obj) -> Dict:
- if isinstance(obj, Mapping):
- return obj
-
- try:
- from absl import flags
- if isinstance(obj, flags.FlagValues):
- return obj.flag_values_dict()
- except ImportError:
- pass
- if isinstance(obj, Namespace):
- return obj.__dict__
- elif isinstance(obj, List):
- new_obj = dict()
- for _o in obj:
- _o_dict = _obj_to_dict(_o)
- new_obj.update(_o_dict)
- return new_obj
- elif not isinstance(obj, Dict):
- if hasattr(obj, "__dict__"):
- return obj.__dict__
- try:
- typename = type(obj).__name__
- except:
- typename = str(obj)
- return {typename: str(obj)}
-
-
-def json_dump_obj(o):
- if hasattr(o, "__name__"):
- return o.__name__
- return str(o)
-
-
-def show_infer_arguments(args: Union[List, Dict, Any]):
- """ print running arguments
- Example 1: For ArgumentParser
- >>> parser = ArgumentParser("Test")
- >>> parser.add_argument("--arg0", type=str)
- >>> args = parser.parse_args()
- >>> show_infer_arguments(args)
-
- Example 2: For dict
- >>> args = dict(arg=1)
- >>> show_infer_arguments(args)
-
- Example 3: For custom object
- >>> from collections import namedtuple
- >>> ArgsType = namedtuple("ArgsType", ["arg"])
- >>> args = ArgsType(arg=123)
- >>> show_infer_arguments(args)
-
- Example 4: For absl
- >>> from absl import flags
- >>> flags.DEFINE_string("arg", "123", "test")
- >>> show_infer_arguments(flags.FLAGS)
-
- Example 5: For multi args
- >>> args1 = dict(a=1)
- >>> args2 = dict(b=2)
- >>> show_infer_arguments([args1, args2])
-
- """
- if not "SHOW_RUNNING_ARGS" in os.environ:
- return
-
- if os.environ["SHOW_RUNNING_ARGS"].lower() in ["0", "f", "false"]:
- return
-
- if "LOCAL_RANK" in os.environ:
- if os.environ["LOCAL_RANK"] != "0":
- return
- args = _obj_to_dict(args)
- args = json.dumps(args, default=json_dump_obj)
- print("[RunningArguments]", args)
-
-
-if __name__ == '__main__':
- os.environ["SHOW_RUNNING_ARGS"] = "1"
- show_infer_arguments([dict(a=1), dict(b=1), object()])
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py
deleted file mode 100644
index 457bdb3ee2aab7d98faa5567856e8fa923589e0a..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import copy
-import os
-
-
-def get_full_path(fname):
- pwd = os.getcwd()
- if fname.startswith('/'):
- return fname
- return os.path.join(pwd, fname)
-
-
-def is_main_proc(rank):
- return str(rank) in ["None", "-1", "0"]
-
-
-def main_proc_print(*args, **kwargs):
- if "RANK" in os.environ:
- if is_main_proc(os.environ["RANK"]):
- print(*args, **kwargs)
- return
-
- if "LOCAL_RANK" in os.environ:
- if is_main_proc(os.environ["LOCAL_RANK"]):
- print(*args, **kwargs)
- return
-
- print(*args, **kwargs)
-
-
-def create_subproc_env():
- env = copy.copy(os.environ)
- env["USE_DLTEST"] = "1"
- return env
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py
deleted file mode 100644
index a9883213f4f44d8253986e91c64f4015c66d6ec4..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import os
-import os.path as ospath
-from pathlib import Path
-import tempfile
-
-
-class TemporaryFile:
-
- def __init__(self, with_open=False, mode='r'):
- self.name = None
- self.with_open = with_open
- self.mode = mode
-
- self.file = None
-
- def create(self):
- self.name = tempfile.mktemp()
- file_path = Path(self.name)
- file_path.touch()
-
- def delete(self):
- if self.name is not None and ospath.exists(self.name):
- os.unlink(self.name)
-
- def read(self):
- self._check_file_status()
- return self.file.read()
-
- def readlines(self):
- self._check_file_status()
- return self.file.readlines()
-
- def _check_file_status(self):
- if self.file is None:
- raise RuntimeError("File is closed, please reopen it.")
-
- def __enter__(self):
- self.create()
- if self.with_open:
- self.file = open(self.name, mode=self.mode)
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- if self.with_open:
- self.file.close()
- self.delete()
-
-
-
-
-
-
-
-
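# A standalone sketch (hypothetical, not taken from the deleted module; assumes a POSIX shell
# with `tee`) of what TemporaryFile is for: create a real file on disk so a shell pipeline such
# as `cmd | tee <file>` can write to it, then read the captured text back and clean up. This
# mirrors how get_output_with_tempfile in subprocess_tools.py (also removed below) uses it.
import os
import subprocess
import tempfile
from pathlib import Path

path = tempfile.mktemp()
Path(path).touch()
try:
    subprocess.run("echo hello | tee {}".format(path), shell=True, check=True)
    with open(path) as f:
        print(f.readlines())  # ['hello\n']
finally:
    os.unlink(path)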
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py
deleted file mode 100644
index 8c5de879b0470d29e208368f1681df8469dcf488..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import subprocess
-from typing import Callable, Union, List
-
-from dltest.utils.real_tempfile import TemporaryFile
-from dltest.utils import misc
-
-
-def get_output_with_pipe(command, shell=None, callback: Callable[[list], None]=None, *args, **kwargs):
- if shell is None:
- shell = True
-
- if shell and not isinstance(command, str):
- command = " ".join(command)
-
- stream = subprocess.Popen(
- command, shell=shell,
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- *args, **kwargs
- )
- outputs = []
- while 1:
- exit_code = stream.poll()
- if exit_code is None:
- if stream.stdout.readable():
- outputs.append(stream.stdout.readline().decode("utf8").rstrip())
- if callback is not None:
- callback(outputs[-1:])
- print(outputs[-1])
- else:
- if stream.stdout.readable():
- lines = stream.stdout.readlines()
- lines = [line.decode("utf8").rstrip() for line in lines]
- outputs.extend(lines)
- if callback is not None:
- callback(outputs[-1:])
- print('\n'.join(lines))
- break
-
- return outputs
-
-
-def get_output_with_tempfile(command, *args, **kwargs):
- if not isinstance(command, (list, tuple)):
- command = [command]
- stdout = None
- with TemporaryFile(with_open=True) as file:
- command.extend(['|', 'tee', file.name])
- command = " ".join(command)
-
- res = subprocess.run(command, stdout=stdout, stderr=subprocess.STDOUT, shell=True, *args, **kwargs)
- output = file.readlines()
-
- return output
-
-def execute_shell(command, *args, **kwargs):
- if "env" not in kwargs:
- kwargs["env"] = misc.create_subproc_env()
-
- if not isinstance(command, (list, tuple)):
- command = [command]
-
- command = " ".join(command)
- res = subprocess.run(command,
- shell=True, *args, **kwargs)
- return res
-
-def get_output(command: List, capture_output_method: str = 'tempfile', *args, **kwargs):
- if "env" not in kwargs:
- kwargs["env"] = misc.create_subproc_env()
-
- if capture_output_method == "tempfile":
- return get_output_with_tempfile(command, *args, **kwargs)
- return get_output_with_pipe(command, *args, **kwargs)
\ No newline at end of file
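# A simplified standalone sketch (hypothetical; assumes a POSIX shell) of the streaming capture
# done by get_output_with_pipe: run the command through a pipe and collect/echo each stdout line
# as it arrives instead of waiting for the process to finish.
import subprocess

proc = subprocess.Popen("echo line1; echo line2", shell=True,
                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
captured = []
for raw in proc.stdout:
    line = raw.decode("utf8").rstrip()
    captured.append(line)
    print(line)
proc.wait()
print(captured)  # ['line1', 'line2']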
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py
deleted file mode 100644
index 2e4fa4eea09fa2cdf51b02619d56fe5fcced869f..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-from setuptools import setup, find_packages
-from dltest.cli.entry_points import make_execute_path
-
-setup(
- name="dltest",
- version="0.1",
- description='Iluvatar Corex AI Toolbox',
- packages=find_packages(exclude=('examples',)),
- include_package_data=True,
- zip_safe=False,
- entry_points = {
- 'console_scripts': make_execute_path(),
- },
- install_requires=[
- 'psutil'
- ]
-)
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_PVT.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_PVT.py
deleted file mode 100644
index 3a9c0ca081a1b44c00b0909c2b69c0e5a00c1e6a..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_PVT.py
+++ /dev/null
@@ -1,593 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_albert_attention import FusionAlbertAttention
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_customfc import (
- FusionCustomFC,
- FusionCustomFCActivation,
- FusionCustomFCGPT2,
-)
-from passes.fusion_disentangled_attention import FusionDisentangledAttention
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
- FusionFormatInvalidMask,
- FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
- FusionBiasSkipLayerNormalization,
- FusionSkipLayerNormalization,
-)
-
-from passes.fusion_utils import FusionUtils
-
-from passes.fusion_conv_reformat import FusionConvReformat
-
-from passes.fusion_xsoftmax import FusionXSoftmax
-from passes.fusion_PVT_attention import FusionPVTAttention
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class PVTOptimizationOptions(FusionOptions):
- """This class is deprecated"""
-
- def __init__(self, model_type):
- logger.warning(
- f"PVTOptimizationOptions is depreciated. Please use FusionOptions instead."
- )
- super().__init__(model_type)
-
-
-class PVTOnnxModel(OnnxModel):
- def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
- """Initialize BERT ONNX Model.
-
- Args:
- model (ModelProto): the ONNX model
- num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
- hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
- """
- assert (num_heads == 0 and hidden_size == 0) or (
- num_heads > 0 and hidden_size % num_heads == 0
- )
-
- super().__init__(model)
- self.num_heads = num_heads
- self.hidden_size = hidden_size
-
- self.attention_mask = AttentionMask(self)
- self.attention_fusion = FusionAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.qordered_attention_fusion = FusionQOrderedAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.utils = FusionUtils(self)
-
- def fuse_attention(self):
- self.attention_fusion.apply()
- FusionAlbertAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- ).apply()
- # FusionVideoBertAttention(self).apply()
- # FusionVITAttention(self).apply()
- # FusionSwinLAttention(self).apply()
- # FusionGptAttentionNoPast(self).apply()
- FusionPVTAttention(self).apply()
- # Only relevant in models with Q-DQ nodes
- self.qordered_attention_fusion.apply()
-
- def fuse_format_roformer(self):
- FusionRemoveUselessElementwise(self).apply()
- fusion = FusionFormatInvalidMask(self)
- fusion.apply()
-
- def fuse_custom_fc(self):
- fusion = FusionCustomFC(self)
- fusion.apply()
-
- def fuse_custom_fc_activation(self):
- fusion = FusionCustomFCActivation(self)
- fusion.apply()
-
- def fuse_custom_fc_gpt2_classify(self):
- fusion = FusionCustomFCGPT2(self)
- fusion.apply()
-
- def fuse_swinT_serial_bias_add(self):
- fusion = FusionSerialBiasAdd(self)
- fusion.apply()
-
- def fuse_gelu(self):
- fusion = FusionGelu(self)
- fusion.apply()
- fusion = FusionFastGelu(self)
- fusion.apply()
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedGelu(self)
- fusion.apply()
-
- def fuse_bias_gelu(self, is_fastgelu):
- fusion = FusionBiasGelu(self, is_fastgelu)
- fusion.apply()
-
- def fuse_custom_xsoftmax(self):
- fusion = FusionXSoftmax(self)
- fusion.apply()
-
- def fuse_disentangled_attention(self):
- fusion = FusionDisentangledAttention(self)
- fusion.apply()
-
- def gelu_approximation(self):
- fusion = FusionGeluApproximation(self)
- fusion.apply()
-
- def fuse_add_bias_skip_layer_norm(self):
- fusion = FusionBiasSkipLayerNormalization(self)
- fusion.apply()
-
- def fuse_reshape(self):
- fusion = FusionReshape(self)
- fusion.apply()
-
- def fuse_shape(self):
- fusion = FusionShape(self)
- fusion.apply()
-
- def fuse_embed_layer(self):
- fusion = FusionEmbedLayerNormalization(self)
- fusion.apply()
-
- def fuse_layer_norm(self):
- fusion = FusionLayerNormalization(self, self.hidden_size)
- fusion.apply()
-
- fusion = FusionLayerNormalizationTF(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedLayerNormalization(self)
- fusion.apply()
-
- def fuse_skip_layer_norm(self):
- fusion = FusionSkipLayerNormalization(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- def fuse_qordered_mamtul(self):
- fusion = FusionQOrderedMatMul(self)
- fusion.apply()
-
- def conv_reformat(self):
- fusion = FusionConvReformat(self)
- fusion.apply()
-
-
-
- def get_graph_inputs_from_node_type(
- self, op_type: str, input_indices: List[int], casted: bool
- ):
- """
- Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention).
- Returns a list of the graph input names based on the filter whether it is casted or not.
- """
- graph_inputs = []
-
- output_name_to_node = self.output_name_to_node()
- nodes = self.get_nodes_by_op_type(op_type)
- for node in nodes:
- bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)]
- for bert_input in bert_inputs:
- if self.find_graph_input(bert_input):
- if not casted:
- graph_inputs.append(bert_input)
- elif bert_input in output_name_to_node:
- parent = output_name_to_node[bert_input]
- if (
- parent.op_type == "Cast"
- and self.find_graph_input(parent.input[0]) is not None
- ):
- if casted:
- graph_inputs.append(parent.input[0])
- return graph_inputs
-
- def get_graph_inputs_from_fused_nodes(self, casted: bool):
- inputs = self.get_graph_inputs_from_node_type(
- "EmbedLayerNormalization", [0, 1, 7], casted
- )
- inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted)
- return inputs
-
- def change_graph_input_type(
- self,
- graph: GraphProto,
- graph_input: ValueInfoProto,
- new_type: int = TensorProto.INT32,
- ):
- """Change graph input type, and add Cast node if needed.
-
- Args:
- graph (GraphProto): graph
- graph_input (TensorProto): input of the graph
- new_type (int, optional): new data type. Defaults to TensorProto.INT32.
-
- Returns:
- NodeProto: a new Cast node that added. None if Cast node is not added.
- List[NodeProto]: Cast nodes that have been removed.
- """
- assert isinstance(graph, GraphProto)
- assert isinstance(graph_input, ValueInfoProto)
- assert self.find_graph_input(graph_input.name)
-
- if graph_input.type.tensor_type.elem_type == int(new_type):
- return None, []
-
- new_cast_node = None
- nodes_to_remove = []
-
- input_name_to_nodes = self.input_name_to_nodes()
- if graph_input.name in input_name_to_nodes:
- nodes = input_name_to_nodes[graph_input.name]
-
- # For children that is not Cast node, insert a Cast node to convert int32 to original data type.
- nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
- if nodes_not_cast:
- node_name = self.create_node_name("Cast")
- output_name = node_name + "_" + graph_input.name
- new_value_info = graph.value_info.add()
- new_value_info.CopyFrom(graph_input)
- new_value_info.name = output_name
- new_cast_node = helper.make_node(
- "Cast",
- [graph_input.name],
- [output_name],
- to=int(graph_input.type.tensor_type.elem_type),
- name=node_name,
- )
- graph.node.extend([new_cast_node])
-
- for node in nodes_not_cast:
- OnnxModel.replace_node_input(node, graph_input.name, output_name)
-
- # For children that is Cast node, no need to insert Cast.
- # When the children is Cast to int32, we can remove that Cast node since input type is int32 now.
- nodes_cast = [node for node in nodes if node.op_type == "Cast"]
- for node in nodes_cast:
- if OnnxModel.get_node_attribute(node, "to") == int(new_type):
- self.replace_input_of_all_nodes(node.output[0], graph_input.name)
- if not self.find_graph_output(node.output[0]):
- nodes_to_remove.append(node)
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
-
- graph_input.type.tensor_type.elem_type = int(new_type)
- return new_cast_node, nodes_to_remove
-
- def change_graph_inputs_to_int32(self):
- """Change data type of all graph inputs to int32 type, and add Cast node if needed."""
- graph = self.graph()
- add_cast_count = 0
- remove_cast_count = 0
- for graph_input in graph.input:
- new_node, removed_nodes = self.change_graph_input_type(
- graph, graph_input, TensorProto.INT32
- )
- if new_node:
- add_cast_count += 1
- remove_cast_count += len(removed_nodes)
- logger.info(
- f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes."
- )
-
- def use_dynamic_axes(
- self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"
- ):
- """
- Update input and output shape to use dynamic axes.
- """
- bert_graph_inputs = self.get_graph_inputs_from_fused_nodes(
- casted=True
- ) + self.get_graph_inputs_from_fused_nodes(casted=False)
-
- dynamic_batch_inputs = {}
- for input in self.model.graph.input:
- if input.name in bert_graph_inputs:
- dim_proto = input.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
- if dynamic_seq_len is not None:
- dim_proto = input.type.tensor_type.shape.dim[1]
- dim_proto.dim_param = dynamic_seq_len
-
- for output in self.model.graph.output:
- dim_proto = output.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
-
- def preprocess(self):
- self.adjust_reshape_and_expand()
- return
-
- def adjust_reshape_and_expand(self):
- nodes_to_remove = []
- for node in self.nodes():
- if node.op_type == "Reshape":
- # Clean up unnecessary Reshape nodes.
- # Find Reshape nodes with no actual data in the "shape" attribute and remove them.
- reshape_shape = self.get_constant_value(node.input[1])
- if reshape_shape is not None and reshape_shape.size == 0:
- nodes_to_remove.extend([node])
- self.replace_input_of_all_nodes(node.output[0], node.input[0])
- continue
-
- # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
- # changing current reshape's input to output of slice.
- reshape_path = self.match_parent_path(
- node,
- ["Expand", "Expand", "Reshape", "Slice"],
- [0, 0, 0, 0],
- self.output_name_to_node(),
- )
- if reshape_path is not None:
- expand_node = reshape_path[-3]
- expand_shape_value = self.get_constant_value(expand_node.input[1])
-
- reshape_before_expand = reshape_path[-2]
- shape_value = self.get_constant_value(
- reshape_before_expand.input[1]
- )
-
- slice_node = reshape_path[-1]
- if (
- expand_shape_value is not None
- and shape_value is not None
- and len(expand_shape_value) == 2
- and len(shape_value) == 1
- and expand_shape_value[1] == shape_value[0]
- ):
- node.input[0] = slice_node.output[0]
-
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
- logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
-
- def clean_graph(self):
- output_name_to_node = self.output_name_to_node()
- nodes_to_remove = []
- for node in self.nodes():
- # Before:
- # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
- # | |
- # | v
- # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
- # After:
- # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
- # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value)
- op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
- if node.op_type in op_input_id:
- i = op_input_id[node.op_type]
- parent_nodes = self.match_parent_path(
- node,
- [
- "Cast",
- "ConstantOfShape",
- "Concat",
- "Unsqueeze",
- "Gather",
- "Shape",
- ],
- [i, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- (
- cast,
- constantOfShape,
- concat,
- unsqueeze,
- gather,
- shape,
- ) = parent_nodes
- if shape.input[0] == self.graph().input[0].name:
- constantOfShape.input[0] = shape.output[0]
- output_name_to_node = self.output_name_to_node()
-
- if node.op_type == "Attention":
- # Before:
- # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
- # After:
- # remove this path, and remove the optional mask_index input of Attention node.
- parent_nodes = self.match_parent_path(
- node,
- ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
- [3, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- if parent_nodes[-1].input[0] == self.graph().input[0].name:
- attention_node = helper.make_node(
- "Attention",
- inputs=node.input[0 : len(node.input) - 1],
- outputs=node.output,
- name=node.name + "_remove_mask",
- )
- attention_node.domain = "com.microsoft"
- attention_node.attribute.extend(
- [helper.make_attribute("num_heads", self.num_heads)]
- )
- self.add_node(
- attention_node, self.get_graph_by_node(attention_node).name
- )
- nodes_to_remove.append(node)
- self.remove_nodes(nodes_to_remove)
-
- def postprocess(self):
- self.clean_graph()
- self.prune_graph()
-
- def optimize(
- self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
- ):
- if (options is not None) and not options.enable_shape_inference:
- self.disable_shape_inference()
-
- self.utils.remove_identity_nodes()
-
- # Remove cast nodes that having same data type of input and output based on symbolic shape inference.
- self.utils.remove_useless_cast_nodes()
-
- if (options is None) or options.enable_layer_norm:
- self.fuse_layer_norm()
-
- if (options is None) or options.enable_gelu:
- self.fuse_gelu()
-
- self.preprocess()
-
- self.fuse_reshape()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- if options is not None and options.enable_swint_opt:
- self.fuse_custom_fc()
- self.fuse_swinT_serial_bias_add()
-
- if options is not None and options.enable_format_roformer:
- self.fuse_format_roformer()
-
- if options is not None and (options.enable_gpt2_classify or options.enable_vit):
- self.fuse_custom_fc_gpt2_classify()
-
- if options is not None and options.enable_vit:
- self.fuse_custom_fc()
-
- # if (options is None) or options.enable_attention:
- # if options is not None:
- # self.attention_mask.set_mask_format(options.attention_mask_format)
- self.fuse_attention()
-
- self.conv_reformat()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- self.fuse_custom_fc()
-
- self.fuse_custom_xsoftmax()
-
- self.fuse_disentangled_attention()
-
- # Perform the MatMul fusion after the Attention fusion as we do not
- # want to fuse the MatMuls inside the Attention subgraphs
- if (options is None) or options.enable_qordered_matmul:
- self.fuse_qordered_mamtul()
-
- self.fuse_shape()
-
- if (options is None) or options.enable_embed_layer_norm:
- self.fuse_embed_layer()
-
- # Remove reshape nodes that having same shape of input and output based on symbolic shape inference.
- self.utils.remove_useless_reshape_nodes()
-
- self.postprocess()
-
- # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
- if (options is None) or options.enable_bias_gelu:
- # Fuse Gelu and Add Bias before it.
- self.fuse_bias_gelu(is_fastgelu=True)
- self.fuse_bias_gelu(is_fastgelu=False)
-
- if (options is None) or options.enable_bias_skip_layer_norm:
- # Fuse SkipLayerNormalization and Add Bias before it.
- self.fuse_add_bias_skip_layer_norm()
-
- if options is not None and options.enable_gelu_approximation:
- self.gelu_approximation()
-
- self.fuse_custom_fc_activation()
-
- self.remove_unused_constant()
-
- # Use symbolic batch dimension in input and output.
- if add_dynamic_axes:
- self.use_dynamic_axes()
-
- logger.info(f"opset version: {self.get_opset_version()}")
-
- def get_fused_operator_statistics(self):
- """
- Returns node count of fused operators.
- """
- op_count = {}
- ops = [
- "EmbedLayerNormalization",
- "Attention",
- "QOrderedAttention",
- "Gelu",
- "QOrderedGelu",
- "FastGelu",
- "BiasGelu",
- "LayerNormalization",
- "QOrderedLayerNormalization",
- "SkipLayerNormalization",
- "QOrderedMatMul",
- ]
- for op in ops:
- nodes = self.get_nodes_by_op_type(op)
- op_count[op] = len(nodes)
- logger.info(f"Optimized operators:{op_count}")
- return op_count
-
- def is_fully_optimized(self):
- """
- Returns True when the model is fully optimized.
- """
- op_count = self.get_fused_operator_statistics()
- embed = op_count["EmbedLayerNormalization"]
- attention = op_count["Attention"] + op_count["QOrderedAttention"]
- gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"]
- layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"]
- is_perfect = (
- (embed > 0)
- and (attention > 0)
- and (attention == gelu)
- and (layer_norm >= 2 * attention)
- )
-
- if layer_norm == 0:
- logger.debug("Layer Normalization not fused")
-
- if gelu == 0:
- logger.debug("Gelu/FastGelu not fused")
-
- if embed == 0:
- logger.debug("Embed Layer not fused")
-
- if attention == 0:
- logger.warning("Attention not fused")
-
- return is_perfect
\ No newline at end of file
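# A sketch of how this optimizer would typically be driven (hypothetical: it assumes the
# optimizer package removed in this diff, including onnx_model_PVT and its passes/ modules,
# is on the Python path, and "model.onnx"/"model_opt.onnx" are placeholder paths).
import onnx
from onnx_model_PVT import PVTOnnxModel
from passes.fusion_options import FusionOptions

model = onnx.load("model.onnx")
opt_model = PVTOnnxModel(model, num_heads=0, hidden_size=0)  # 0 -> let the fusions auto-detect
opt_model.optimize(FusionOptions("bert"))
opt_model.get_fused_operator_statistics()
onnx.save(opt_model.model, "model_opt.onnx")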
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py
deleted file mode 100644
index 7324603e61bb7a13a57e586827c8fa67a9af4ae2..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py
+++ /dev/null
@@ -1,627 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_albert_attention import FusionAlbertAttention
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_customfc import (
- FusionCustomFC,
- FusionCustomFCActivation,
- FusionCustomFCGPT2,
- FusionTorchvisionVitCustomFC,
-)
-from passes.fusion_disentangled_attention import FusionDisentangledAttention
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
- FusionFormatInvalidMask,
- FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
- FusionBiasSkipLayerNormalization,
- FusionSkipLayerNormalization,
-)
-from passes.fusion_swinl_attention import FusionSwinLAttention
-from passes.fusion_utils import FusionUtils
-from passes.fusion_videobert_attention import FusionVideoBertAttention
-from passes.fusion_vit_attention import FusionVITAttention, FusionTorchvisionVITAttention
-from passes.fusion_xsoftmax import FusionXSoftmax
-from passes.fuse_inverse_sigmoid import FusionLayerInverseSigmoid
-from passes.fuse_l2_normalization import FusionLayerL2Normalization
-from passes.fuse_omdet_attention import FusionLayerOmdetAttention
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class BertOptimizationOptions(FusionOptions):
- """This class is deprecated"""
-
- def __init__(self, model_type):
- logger.warning(
- f"BertOptimizationOptions is depreciated. Please use FusionOptions instead."
- )
- super().__init__(model_type)
-
-
-class BertOnnxModel(OnnxModel):
- def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
- """Initialize BERT ONNX Model.
-
- Args:
- model (ModelProto): the ONNX model
- num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
- hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
- """
- assert (num_heads == 0 and hidden_size == 0) or (
- num_heads > 0 and hidden_size % num_heads == 0
- )
-
- super().__init__(model)
- self.num_heads = num_heads
- self.hidden_size = hidden_size
-
- self.attention_mask = AttentionMask(self)
- self.attention_fusion = FusionAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.qordered_attention_fusion = FusionQOrderedAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.utils = FusionUtils(self)
-
- def fuse_attention(self):
- self.attention_fusion.apply()
- FusionAlbertAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- ).apply()
- FusionVideoBertAttention(self).apply()
- FusionVITAttention(self).apply()
- FusionTorchvisionVITAttention(self).apply()
- FusionSwinLAttention(self).apply()
- FusionGptAttentionNoPast(self).apply()
- # Only relevant in models with Q-DQ nodes
- self.qordered_attention_fusion.apply()
-
- def fuse_format_roformer(self):
- FusionRemoveUselessElementwise(self).apply()
- fusion = FusionFormatInvalidMask(self)
- fusion.apply()
-
- def fuse_custom_fc(self):
- fusion = FusionCustomFC(self)
- fusion.apply()
-
- def fuse_custom_fc_torchvision_vit(self):
- fusion = FusionTorchvisionVitCustomFC(self)
- fusion.apply()
-
- def fuse_custom_fc_activation(self):
- fusion = FusionCustomFCActivation(self)
- fusion.apply()
-
- def fuse_custom_fc_gpt2_classify(self):
- fusion = FusionCustomFCGPT2(self)
- fusion.apply()
-
- def fuse_swinT_serial_bias_add(self):
- fusion = FusionSerialBiasAdd(self)
- fusion.apply()
-
- def fuse_gelu(self):
- fusion = FusionGelu(self)
- fusion.apply()
- fusion = FusionFastGelu(self)
- fusion.apply()
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedGelu(self)
- fusion.apply()
-
- def fuse_bias_gelu(self, is_fastgelu):
- fusion = FusionBiasGelu(self, is_fastgelu)
- fusion.apply()
-
- def fuse_custom_xsoftmax(self):
- fusion = FusionXSoftmax(self)
- fusion.apply()
-
- def fuse_disentangled_attention(self):
- fusion = FusionDisentangledAttention(self)
- fusion.apply()
-
- def gelu_approximation(self):
- fusion = FusionGeluApproximation(self)
- fusion.apply()
-
- def fuse_add_bias_skip_layer_norm(self):
- fusion = FusionBiasSkipLayerNormalization(self)
- fusion.apply()
-
- def fuse_reshape(self):
- fusion = FusionReshape(self)
- fusion.apply()
-
- def fuse_shape(self):
- fusion = FusionShape(self)
- fusion.apply()
-
- def fuse_embed_layer(self):
- fusion = FusionEmbedLayerNormalization(self)
- fusion.apply()
-
- def fuse_layer_norm(self):
- fusion = FusionLayerNormalization(self, self.hidden_size)
- fusion.apply()
-
- fusion = FusionLayerNormalizationTF(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedLayerNormalization(self)
- fusion.apply()
-
- def fuse_skip_layer_norm(self):
- fusion = FusionSkipLayerNormalization(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- def fuse_qordered_mamtul(self):
- fusion = FusionQOrderedMatMul(self)
- fusion.apply()
-
- def fuse_omdet_inverse_sigmoid(self):
- fusion = FusionLayerInverseSigmoid(self)
- fusion.apply()
-
- def fuse_omdet_attention(self):
- fusion = FusionLayerOmdetAttention(self)
- fusion.apply()
-
- def fuse_l2_normalization(self):
- fusion = FusionLayerL2Normalization(self)
- fusion.apply()
-
- def get_graph_inputs_from_node_type(
- self, op_type: str, input_indices: List[int], casted: bool
- ):
- """
- Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention).
- Returns a list of the graph input names based on the filter whether it is casted or not.
- """
- graph_inputs = []
-
- output_name_to_node = self.output_name_to_node()
- nodes = self.get_nodes_by_op_type(op_type)
- for node in nodes:
- bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)]
- for bert_input in bert_inputs:
- if self.find_graph_input(bert_input):
- if not casted:
- graph_inputs.append(bert_input)
- elif bert_input in output_name_to_node:
- parent = output_name_to_node[bert_input]
- if (
- parent.op_type == "Cast"
- and self.find_graph_input(parent.input[0]) is not None
- ):
- if casted:
- graph_inputs.append(parent.input[0])
- return graph_inputs
-
- def get_graph_inputs_from_fused_nodes(self, casted: bool):
- inputs = self.get_graph_inputs_from_node_type(
- "EmbedLayerNormalization", [0, 1, 7], casted
- )
- inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted)
- return inputs
-
- def change_graph_input_type(
- self,
- graph: GraphProto,
- graph_input: ValueInfoProto,
- new_type: int = TensorProto.INT32,
- ):
- """Change graph input type, and add Cast node if needed.
-
- Args:
- graph (GraphProto): graph
- graph_input (TensorProto): input of the graph
- new_type (int, optional): new data type. Defaults to TensorProto.INT32.
-
- Returns:
- NodeProto: a new Cast node that added. None if Cast node is not added.
- List[NodeProto]: Cast nodes that have been removed.
- """
- assert isinstance(graph, GraphProto)
- assert isinstance(graph_input, ValueInfoProto)
- assert self.find_graph_input(graph_input.name)
-
- if graph_input.type.tensor_type.elem_type == int(new_type):
- return None, []
-
- new_cast_node = None
- nodes_to_remove = []
-
- input_name_to_nodes = self.input_name_to_nodes()
- if graph_input.name in input_name_to_nodes:
- nodes = input_name_to_nodes[graph_input.name]
-
- # For children that is not Cast node, insert a Cast node to convert int32 to original data type.
- nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
- if nodes_not_cast:
- node_name = self.create_node_name("Cast")
- output_name = node_name + "_" + graph_input.name
- new_value_info = graph.value_info.add()
- new_value_info.CopyFrom(graph_input)
- new_value_info.name = output_name
- new_cast_node = helper.make_node(
- "Cast",
- [graph_input.name],
- [output_name],
- to=int(graph_input.type.tensor_type.elem_type),
- name=node_name,
- )
- graph.node.extend([new_cast_node])
-
- for node in nodes_not_cast:
- OnnxModel.replace_node_input(node, graph_input.name, output_name)
-
- # For children that is Cast node, no need to insert Cast.
- # When the children is Cast to int32, we can remove that Cast node since input type is int32 now.
- nodes_cast = [node for node in nodes if node.op_type == "Cast"]
- for node in nodes_cast:
- if OnnxModel.get_node_attribute(node, "to") == int(new_type):
- self.replace_input_of_all_nodes(node.output[0], graph_input.name)
- if not self.find_graph_output(node.output[0]):
- nodes_to_remove.append(node)
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
-
- graph_input.type.tensor_type.elem_type = int(new_type)
- return new_cast_node, nodes_to_remove
-
- def change_graph_inputs_to_int32(self):
- """Change data type of all graph inputs to int32 type, and add Cast node if needed."""
- graph = self.graph()
- add_cast_count = 0
- remove_cast_count = 0
- for graph_input in graph.input:
- new_node, removed_nodes = self.change_graph_input_type(
- graph, graph_input, TensorProto.INT32
- )
- if new_node:
- add_cast_count += 1
- remove_cast_count += len(removed_nodes)
- logger.info(
- f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes."
- )
-
- def use_dynamic_axes(
- self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"
- ):
- """
- Update input and output shape to use dynamic axes.
- """
- bert_graph_inputs = self.get_graph_inputs_from_fused_nodes(
- casted=True
- ) + self.get_graph_inputs_from_fused_nodes(casted=False)
-
- dynamic_batch_inputs = {}
- for input in self.model.graph.input:
- if input.name in bert_graph_inputs:
- dim_proto = input.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
- if dynamic_seq_len is not None:
- dim_proto = input.type.tensor_type.shape.dim[1]
- dim_proto.dim_param = dynamic_seq_len
-
- for output in self.model.graph.output:
- dim_proto = output.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
-
- def preprocess(self):
- self.adjust_reshape_and_expand()
- return
-
- def adjust_reshape_and_expand(self):
- nodes_to_remove = []
- for node in self.nodes():
- if node.op_type == "Reshape":
-                # Clean up unnecessary Reshape nodes.
-                # Find Reshape nodes whose "shape" input has no actual data and remove them.
- reshape_shape = self.get_constant_value(node.input[1])
- if reshape_shape is not None and reshape_shape.size == 0:
- nodes_to_remove.extend([node])
- self.replace_input_of_all_nodes(node.output[0], node.input[0])
- continue
-
- # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
- # changing current reshape's input to output of slice.
- reshape_path = self.match_parent_path(
- node,
- ["Expand", "Expand", "Reshape", "Slice"],
- [0, 0, 0, 0],
- self.output_name_to_node(),
- )
- if reshape_path is not None:
- expand_node = reshape_path[-3]
- expand_shape_value = self.get_constant_value(expand_node.input[1])
-
- reshape_before_expand = reshape_path[-2]
- shape_value = self.get_constant_value(
- reshape_before_expand.input[1]
- )
-
- slice_node = reshape_path[-1]
- if (
- expand_shape_value is not None
- and shape_value is not None
- and len(expand_shape_value) == 2
- and len(shape_value) == 1
- and expand_shape_value[1] == shape_value[0]
- ):
- node.input[0] = slice_node.output[0]
-
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
- logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
-
- def clean_graph(self):
- output_name_to_node = self.output_name_to_node()
- nodes_to_remove = []
- for node in self.nodes():
- # Before:
- # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
- # | |
- # | v
-            #  +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
-            # After:
-            #  input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
- # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value)
- op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
- if node.op_type in op_input_id:
- i = op_input_id[node.op_type]
- parent_nodes = self.match_parent_path(
- node,
- [
- "Cast",
- "ConstantOfShape",
- "Concat",
- "Unsqueeze",
- "Gather",
- "Shape",
- ],
- [i, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- (
- cast,
- constantOfShape,
- concat,
- unsqueeze,
- gather,
- shape,
- ) = parent_nodes
- if shape.input[0] == self.graph().input[0].name:
- constantOfShape.input[0] = shape.output[0]
- output_name_to_node = self.output_name_to_node()
-
- if node.op_type == "Attention":
- # Before:
- # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
- # After:
- # remove this path, and remove the optional mask_index input of Attention node.
- parent_nodes = self.match_parent_path(
- node,
- ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
- [3, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- if parent_nodes[-1].input[0] == self.graph().input[0].name:
- attention_node = helper.make_node(
- "Attention",
- inputs=node.input[0 : len(node.input) - 1],
- outputs=node.output,
- name=node.name + "_remove_mask",
- )
- attention_node.domain = "com.microsoft"
- attention_node.attribute.extend(
- [helper.make_attribute("num_heads", self.num_heads)]
- )
- self.add_node(
- attention_node, self.get_graph_by_node(attention_node).name
- )
- nodes_to_remove.append(node)
- self.remove_nodes(nodes_to_remove)
-
- def postprocess(self):
- self.clean_graph()
- self.prune_graph()
-
- def optimize(
- self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
- ):
- if (options is not None) and not options.enable_shape_inference:
- self.disable_shape_inference()
-
- self.utils.remove_identity_nodes()
-
-        # Remove Cast nodes whose input and output have the same data type, based on symbolic shape inference.
- self.utils.remove_useless_cast_nodes()
-
- if (options is None) or options.enable_layer_norm:
- self.fuse_layer_norm()
-
- if (options is None) or options.enable_gelu:
- self.fuse_gelu()
-
- self.preprocess()
-
- self.fuse_reshape()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- if options.enable_swint_opt:
- self.fuse_custom_fc()
- self.fuse_swinT_serial_bias_add()
-
- if options.enable_format_roformer:
- self.fuse_format_roformer()
-
- if options.enable_gpt2_classify or options.enable_vit:
- self.fuse_custom_fc_gpt2_classify()
-
- if options.enable_vit:
- self.fuse_custom_fc()
-
- if (options is None) or options.enable_attention:
- if options is not None:
- self.attention_mask.set_mask_format(options.attention_mask_format)
- self.fuse_attention()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- self.fuse_custom_fc()
-
- if options.enable_omdet:
- self.fuse_omdet_attention()
- self.fuse_omdet_inverse_sigmoid()
- self.fuse_l2_normalization()
-
- self.fuse_custom_xsoftmax()
-
- self.fuse_disentangled_attention()
-
- # Perform the MatMul fusion after the Attention fusion as we do not
- # want to fuse the MatMuls inside the Attention subgraphs
- if (options is None) or options.enable_qordered_matmul:
- self.fuse_qordered_mamtul()
-
- self.fuse_shape()
-
- if (options is None) or options.enable_embed_layer_norm:
- self.fuse_embed_layer()
-
-        # Remove Reshape nodes whose input and output have the same shape, based on symbolic shape inference.
- self.utils.remove_useless_reshape_nodes()
-
- self.postprocess()
-
- # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
- if (options is None) or options.enable_bias_gelu:
- # Fuse Gelu and Add Bias before it.
- self.fuse_bias_gelu(is_fastgelu=True)
- self.fuse_bias_gelu(is_fastgelu=False)
-
- if (options is None) or options.enable_bias_skip_layer_norm:
- # Fuse SkipLayerNormalization and Add Bias before it.
- self.fuse_add_bias_skip_layer_norm()
-
- if options is not None and options.enable_gelu_approximation:
- self.gelu_approximation()
-
- self.fuse_custom_fc_activation()
-
- if options.enable_vit:
- self.fuse_custom_fc_torchvision_vit()
-
- self.remove_unused_constant()
-
- # Use symbolic batch dimension in input and output.
- if add_dynamic_axes:
- self.use_dynamic_axes()
-
- logger.info(f"opset version: {self.get_opset_version()}")
-
- def get_fused_operator_statistics(self):
- """
- Returns node count of fused operators.
- """
- op_count = {}
- ops = [
- "EmbedLayerNormalization",
- "Attention",
- "QOrderedAttention",
- "Gelu",
- "QOrderedGelu",
- "FastGelu",
- "BiasGelu",
- "LayerNormalization",
- "QOrderedLayerNormalization",
- "SkipLayerNormalization",
- "QOrderedMatMul",
- ]
- for op in ops:
- nodes = self.get_nodes_by_op_type(op)
- op_count[op] = len(nodes)
- logger.info(f"Optimized operators:{op_count}")
- return op_count
-
- def is_fully_optimized(self):
- """
- Returns True when the model is fully optimized.
- """
- op_count = self.get_fused_operator_statistics()
- embed = op_count["EmbedLayerNormalization"]
- attention = op_count["Attention"] + op_count["QOrderedAttention"]
- gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"]
- layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"]
- is_perfect = (
- (embed > 0)
- and (attention > 0)
- and (attention == gelu)
- and (layer_norm >= 2 * attention)
- )
-
- if layer_norm == 0:
- logger.debug("Layer Normalization not fused")
-
- if gelu == 0:
- logger.debug("Gelu/FastGelu not fused")
-
- if embed == 0:
- logger.debug("Embed Layer not fused")
-
- if attention == 0:
- logger.warning("Attention not fused")
-
- return is_perfect
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py
deleted file mode 100644
index cc59c37bd48f677a7d06f141f45eaa55aef54656..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py
+++ /dev/null
@@ -1,591 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_albert_attention import FusionAlbertAttention
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_conformer_attention import FusionConformerAttention
-from passes.fusion_conformer_xsoftmax import FusionConformerXSoftmax
-from passes.fusion_customfc import (
- FusionConformerCustomFCActivation,
- FusionCustomFC,
- FusionCustomFCGPT2,
-)
-from passes.fusion_disentangled_attention import FusionDisentangledAttention
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
- FusionFormatInvalidMask,
- FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
- FusionBiasSkipLayerNormalization,
- FusionSkipLayerNormalization,
-)
-from passes.fusion_splitQKV import FusionSplitQKV
-from passes.fusion_swinl_attention import FusionSwinLAttention
-from passes.fusion_utils import FusionUtils
-from passes.fusion_vit_attention import FusionVITAttention
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class ConformerOptimizationOptions(FusionOptions):
- """This class is deprecated"""
-
- def __init__(self, model_type):
- logger.warning(
- f"BertOptimizationOptions is depreciated. Please use FusionOptions instead."
- )
- super().__init__(model_type)
-
-
-class conformerOnnxModel(OnnxModel):
- def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
- """Initialize BERT ONNX Model.
-
- Args:
- model (ModelProto): the ONNX model
- num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
- hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
- """
- assert (num_heads == 0 and hidden_size == 0) or (
- num_heads > 0 and hidden_size % num_heads == 0
- )
-
- super().__init__(model)
- self.num_heads = num_heads
- self.hidden_size = hidden_size
-
- self.attention_mask = AttentionMask(self)
- self.attention_fusion = FusionAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.qordered_attention_fusion = FusionQOrderedAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.utils = FusionUtils(self)
-
- def fuse_attention(self):
- FusionConformerAttention(self, self.hidden_size, self.num_heads).apply()
- # Only relevant in models with Q-DQ nodes
- self.qordered_attention_fusion.apply()
-
- def fuse_format_roformer(self):
- FusionRemoveUselessElementwise(self).apply()
- fusion = FusionFormatInvalidMask(self)
- fusion.apply()
-
- def fuse_custom_fc(self):
- fusion = FusionCustomFC(self)
- fusion.apply()
-
- def fuse_custom_fc_conformer_activation(self):
- fusion = FusionConformerCustomFCActivation(self)
- fusion.apply()
-
- def fuse_custom_fc_gpt2_classify(self):
- fusion = FusionCustomFCGPT2(self)
- fusion.apply()
-
- def fuse_swinT_serial_bias_add(self):
- fusion = FusionSerialBiasAdd(self)
- fusion.apply()
-
- def fuse_gelu(self):
- fusion = FusionGelu(self)
- fusion.apply()
- fusion = FusionFastGelu(self)
- fusion.apply()
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedGelu(self)
- fusion.apply()
-
- def fuse_bias_gelu(self, is_fastgelu):
- fusion = FusionBiasGelu(self, is_fastgelu)
- fusion.apply()
-
- def fuse_custom_xsoftmax(self):
- fusion = FusionConformerXSoftmax(self)
- fusion.apply()
-
- def fuse_disentangled_attention(self):
- fusion = FusionDisentangledAttention(self)
- fusion.apply()
-
- def gelu_approximation(self):
- fusion = FusionGeluApproximation(self)
- fusion.apply()
-
- def fuse_add_bias_skip_layer_norm(self):
- fusion = FusionBiasSkipLayerNormalization(self)
- fusion.apply()
-
- def fuse_reshape(self):
- fusion = FusionReshape(self)
- fusion.apply()
-
- def fuse_shape(self):
- fusion = FusionShape(self)
- fusion.apply()
-
- def fuse_embed_layer(self):
- fusion = FusionEmbedLayerNormalization(self)
- fusion.apply()
-
- def fuse_layer_norm(self):
- fusion = FusionLayerNormalization(self, self.hidden_size)
- fusion.apply()
-
- fusion = FusionLayerNormalizationTF(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedLayerNormalization(self)
- fusion.apply()
-
- def fuse_skip_layer_norm(self):
- fusion = FusionSkipLayerNormalization(self)
- fusion.apply()
-
- def fuse_split_qkv(self):
- fusion = FusionSplitQKV(self, self.hidden_size, self.num_heads)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- def fuse_qordered_mamtul(self):
- fusion = FusionQOrderedMatMul(self)
- fusion.apply()
-
- def get_graph_inputs_from_node_type(
- self, op_type: str, input_indices: List[int], casted: bool
- ):
- """
- Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention).
-        Returns a list of graph input names, filtered by whether the input is casted or not.
- """
- graph_inputs = []
-
- output_name_to_node = self.output_name_to_node()
- nodes = self.get_nodes_by_op_type(op_type)
- for node in nodes:
- bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)]
- for bert_input in bert_inputs:
- if self.find_graph_input(bert_input):
- if not casted:
- graph_inputs.append(bert_input)
- elif bert_input in output_name_to_node:
- parent = output_name_to_node[bert_input]
- if (
- parent.op_type == "Cast"
- and self.find_graph_input(parent.input[0]) is not None
- ):
- if casted:
- graph_inputs.append(parent.input[0])
- return graph_inputs
-
- def get_graph_inputs_from_fused_nodes(self, casted: bool):
- inputs = self.get_graph_inputs_from_node_type(
- "EmbedLayerNormalization", [0, 1, 7], casted
- )
- inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted)
- return inputs
-
- def change_graph_input_type(
- self,
- graph: GraphProto,
- graph_input: ValueInfoProto,
- new_type: int = TensorProto.INT32,
- ):
- """Change graph input type, and add Cast node if needed.
-
- Args:
- graph (GraphProto): graph
-            graph_input (ValueInfoProto): input of the graph
- new_type (int, optional): new data type. Defaults to TensorProto.INT32.
-
- Returns:
-            NodeProto: the new Cast node that was added, or None if no Cast node was added.
- List[NodeProto]: Cast nodes that have been removed.
- """
- assert isinstance(graph, GraphProto)
- assert isinstance(graph_input, ValueInfoProto)
- assert self.find_graph_input(graph_input.name)
-
- if graph_input.type.tensor_type.elem_type == int(new_type):
- return None, []
-
- new_cast_node = None
- nodes_to_remove = []
-
- input_name_to_nodes = self.input_name_to_nodes()
- if graph_input.name in input_name_to_nodes:
- nodes = input_name_to_nodes[graph_input.name]
-
-            # For child nodes that are not Cast nodes, insert a Cast node to convert int32 back to the original data type.
- nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
- if nodes_not_cast:
- node_name = self.create_node_name("Cast")
- output_name = node_name + "_" + graph_input.name
- new_value_info = graph.value_info.add()
- new_value_info.CopyFrom(graph_input)
- new_value_info.name = output_name
- new_cast_node = helper.make_node(
- "Cast",
- [graph_input.name],
- [output_name],
- to=int(graph_input.type.tensor_type.elem_type),
- name=node_name,
- )
- graph.node.extend([new_cast_node])
-
- for node in nodes_not_cast:
- OnnxModel.replace_node_input(node, graph_input.name, output_name)
-
-            # For child nodes that are already Cast nodes, there is no need to insert another Cast.
-            # When a child casts to int32, that Cast node can be removed since the input type is int32 now.
- nodes_cast = [node for node in nodes if node.op_type == "Cast"]
- for node in nodes_cast:
- if OnnxModel.get_node_attribute(node, "to") == int(new_type):
- self.replace_input_of_all_nodes(node.output[0], graph_input.name)
- if not self.find_graph_output(node.output[0]):
- nodes_to_remove.append(node)
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
-
- graph_input.type.tensor_type.elem_type = int(new_type)
- return new_cast_node, nodes_to_remove
-
- def change_graph_inputs_to_int32(self):
- """Change data type of all graph inputs to int32 type, and add Cast node if needed."""
- graph = self.graph()
- add_cast_count = 0
- remove_cast_count = 0
- for graph_input in graph.input:
- new_node, removed_nodes = self.change_graph_input_type(
- graph, graph_input, TensorProto.INT32
- )
- if new_node:
- add_cast_count += 1
- remove_cast_count += len(removed_nodes)
- logger.info(
- f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes."
- )
-
- def use_dynamic_axes(
- self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"
- ):
- """
- Update input and output shape to use dynamic axes.
- """
- bert_graph_inputs = self.get_graph_inputs_from_fused_nodes(
- casted=True
- ) + self.get_graph_inputs_from_fused_nodes(casted=False)
-
- dynamic_batch_inputs = {}
- for input in self.model.graph.input:
- if input.name in bert_graph_inputs:
- dim_proto = input.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
- if dynamic_seq_len is not None:
- dim_proto = input.type.tensor_type.shape.dim[1]
- dim_proto.dim_param = dynamic_seq_len
-
- for output in self.model.graph.output:
- dim_proto = output.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
-
- def preprocess(self):
- self.adjust_reshape_and_expand()
- return
-
- def adjust_reshape_and_expand(self):
- nodes_to_remove = []
- for node in self.nodes():
- if node.op_type == "Reshape":
-                # Clean up unnecessary Reshape nodes.
-                # Find Reshape nodes whose "shape" input has no actual data and remove them.
- reshape_shape = self.get_constant_value(node.input[1])
- if reshape_shape is not None and reshape_shape.size == 0:
- nodes_to_remove.extend([node])
- self.replace_input_of_all_nodes(node.output[0], node.input[0])
- continue
-
- # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
- # changing current reshape's input to output of slice.
- reshape_path = self.match_parent_path(
- node,
- ["Expand", "Expand", "Reshape", "Slice"],
- [0, 0, 0, 0],
- self.output_name_to_node(),
- )
- if reshape_path is not None:
- expand_node = reshape_path[-3]
- expand_shape_value = self.get_constant_value(expand_node.input[1])
-
- reshape_before_expand = reshape_path[-2]
- shape_value = self.get_constant_value(
- reshape_before_expand.input[1]
- )
-
- slice_node = reshape_path[-1]
- if (
- expand_shape_value is not None
- and shape_value is not None
- and len(expand_shape_value) == 2
- and len(shape_value) == 1
- and expand_shape_value[1] == shape_value[0]
- ):
- node.input[0] = slice_node.output[0]
-
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
- logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
-
- def clean_graph(self):
- output_name_to_node = self.output_name_to_node()
- nodes_to_remove = []
- for node in self.nodes():
- # Before:
- # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
- # | |
- # | v
-            #  +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
-            # After:
-            #  input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
- # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value)
- op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
- if node.op_type in op_input_id:
- i = op_input_id[node.op_type]
- parent_nodes = self.match_parent_path(
- node,
- [
- "Cast",
- "ConstantOfShape",
- "Concat",
- "Unsqueeze",
- "Gather",
- "Shape",
- ],
- [i, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- (
- cast,
- constantOfShape,
- concat,
- unsqueeze,
- gather,
- shape,
- ) = parent_nodes
- if shape.input[0] == self.graph().input[0].name:
- constantOfShape.input[0] = shape.output[0]
- output_name_to_node = self.output_name_to_node()
-
- if node.op_type == "Attention":
- # Before:
- # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
- # After:
- # remove this path, and remove the optional mask_index input of Attention node.
- parent_nodes = self.match_parent_path(
- node,
- ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
- [3, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- if parent_nodes[-1].input[0] == self.graph().input[0].name:
- attention_node = helper.make_node(
- "Attention",
- inputs=node.input[0 : len(node.input) - 1],
- outputs=node.output,
- name=node.name + "_remove_mask",
- )
- attention_node.domain = "com.microsoft"
- attention_node.attribute.extend(
- [helper.make_attribute("num_heads", self.num_heads)]
- )
- self.add_node(
- attention_node, self.get_graph_by_node(attention_node).name
- )
- nodes_to_remove.append(node)
- self.remove_nodes(nodes_to_remove)
-
- def postprocess(self):
- self.clean_graph()
- self.prune_graph()
-
- def optimize(
- self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
- ):
- if (options is not None) and not options.enable_shape_inference:
- self.disable_shape_inference()
-
- self.utils.remove_identity_nodes()
-
-        # Remove Cast nodes whose input and output have the same data type, based on symbolic shape inference.
- self.utils.remove_useless_cast_nodes()
-
- if (options is None) or options.enable_layer_norm:
- self.fuse_layer_norm()
-
- if (options is None) or options.enable_gelu:
- self.fuse_gelu()
-
- self.preprocess()
-
- self.fuse_reshape()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- if options.enable_swint_opt:
- self.fuse_custom_fc()
- self.fuse_swinT_serial_bias_add()
-
- if options.enable_format_roformer:
- self.fuse_format_roformer()
-
- if options.enable_gpt2_classify or options.enable_vit:
- self.fuse_custom_fc_gpt2_classify()
-
- if options.enable_vit:
- self.fuse_custom_fc()
-
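-        # Conformer-specific fusion sequence: custom FC, conformer XSoftmax, conformer attention, and split-QKV.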
- self.fuse_custom_fc()
- self.fuse_custom_xsoftmax()
-
- self.fuse_attention()
-
- self.fuse_split_qkv()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- # Perform the MatMul fusion after the Attention fusion as we do not
- # want to fuse the MatMuls inside the Attention subgraphs
- if (options is None) or options.enable_qordered_matmul:
- self.fuse_qordered_mamtul()
-
- self.fuse_shape()
-
- if (options is None) or options.enable_embed_layer_norm:
- self.fuse_embed_layer()
-
-        # Remove Reshape nodes whose input and output have the same shape, based on symbolic shape inference.
- self.utils.remove_useless_reshape_nodes()
-
- self.postprocess()
-
- # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
- if (options is None) or options.enable_bias_gelu:
- # Fuse Gelu and Add Bias before it.
- self.fuse_bias_gelu(is_fastgelu=True)
- self.fuse_bias_gelu(is_fastgelu=False)
-
- if (options is None) or options.enable_bias_skip_layer_norm:
- # Fuse SkipLayerNormalization and Add Bias before it.
- self.fuse_add_bias_skip_layer_norm()
-
- if options is not None and options.enable_gelu_approximation:
- self.gelu_approximation()
-
- self.remove_unused_constant()
- self.fuse_custom_fc_conformer_activation()
-
- # Use symbolic batch dimension in input and output.
- if add_dynamic_axes:
- self.use_dynamic_axes()
-
- logger.info(f"opset version: {self.get_opset_version()}")
-
- def get_fused_operator_statistics(self):
- """
- Returns node count of fused operators.
- """
- op_count = {}
- ops = [
- "EmbedLayerNormalization",
- "Attention",
- "QOrderedAttention",
- "Gelu",
- "QOrderedGelu",
- "FastGelu",
- "BiasGelu",
- "LayerNormalization",
- "QOrderedLayerNormalization",
- "SkipLayerNormalization",
- "QOrderedMatMul",
- ]
- for op in ops:
- nodes = self.get_nodes_by_op_type(op)
- op_count[op] = len(nodes)
- logger.info(f"Optimized operators:{op_count}")
- return op_count
-
- def is_fully_optimized(self):
- """
- Returns True when the model is fully optimized.
- """
- op_count = self.get_fused_operator_statistics()
- embed = op_count["EmbedLayerNormalization"]
- attention = op_count["Attention"] + op_count["QOrderedAttention"]
- gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"]
- layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"]
- is_perfect = (
- (embed > 0)
- and (attention > 0)
- and (attention == gelu)
- and (layer_norm >= 2 * attention)
- )
-
- if layer_norm == 0:
- logger.debug("Layer Normalization not fused")
-
- if gelu == 0:
- logger.debug("Gelu/FastGelu not fused")
-
- if embed == 0:
- logger.debug("Embed Layer not fused")
-
- if attention == 0:
- logger.warning("Attention not fused")
-
- return is_perfect
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_cosyvoice.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_cosyvoice.py
deleted file mode 100755
index 98cfc6699ab5276f2fd37915a62487a173fb4d12..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_cosyvoice.py
+++ /dev/null
@@ -1,640 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_albert_attention import FusionAlbertAttention
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_customfc import (
- FusionCustomFC,
- FusionCustomFCActivation,
- FusionCustomFCGPT2,
- FusionTorchvisionVitCustomFC,
-)
-from passes.fusion_disentangled_attention import FusionDisentangledAttention
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
- FusionFormatInvalidMask,
- FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
- FusionBiasSkipLayerNormalization,
- FusionSkipLayerNormalization,
-)
-from passes.fusion_swinl_attention import FusionSwinLAttention
-from passes.fusion_utils import FusionUtils
-from passes.fusion_videobert_attention import FusionVideoBertAttention
-from passes.fusion_vit_attention import FusionVITAttention, FusionTorchvisionVITAttention
-from passes.fusion_xsoftmax import FusionXSoftmax
-from passes.fuse_inverse_sigmoid import FusionLayerInverseSigmoid
-from passes.fuse_l2_normalization import FusionLayerL2Normalization
-from passes.fuse_omdet_attention import FusionLayerOmdetAttention
-from passes.onnx_model import OnnxModel
-
-from passes.fusion_cosyvoice_splitQKV_update_KVcache import FusionCosyVoiceSplitQKVUpdateKVCache
-from passes.fusion_cosyvoice_attention import (
- FusionCosyvoiceAttention
-)
-from passes.fusion_cosyvoice_splitQKV import FusionSplitQKV
-
-
-
-logger = getLogger(__name__)
-
-
-
-class cosyvoiceOnnxModel(OnnxModel):
- def __init__(self, model: ModelProto, num_heads: int = 16, hidden_size: int = 1024):
- """Initialize BERT ONNX Model.
-
- Args:
- model (ModelProto): the ONNX model
-            num_heads (int, optional): number of attention heads. Defaults to 16.
-            hidden_size (int, optional): hidden dimension. Defaults to 1024.
- """
- assert (num_heads == 0 and hidden_size == 0) or (
- num_heads > 0 and hidden_size % num_heads == 0
- )
-
- super().__init__(model)
- self.num_heads = num_heads
- self.hidden_size = hidden_size
-
- self.attention_mask = AttentionMask(self)
- self.attention_fusion = FusionAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.qordered_attention_fusion = FusionQOrderedAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.utils = FusionUtils(self)
-
- def fuse_attention(self):
- self.attention_fusion.apply()
- FusionAlbertAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- ).apply()
- FusionVideoBertAttention(self).apply()
- FusionVITAttention(self).apply()
- FusionTorchvisionVITAttention(self).apply()
- FusionSwinLAttention(self).apply()
- FusionGptAttentionNoPast(self).apply()
- # Only relevant in models with Q-DQ nodes
- self.qordered_attention_fusion.apply()
-
- def fuse_format_roformer(self):
- FusionRemoveUselessElementwise(self).apply()
- fusion = FusionFormatInvalidMask(self)
- fusion.apply()
-
- def fuse_custom_fc(self):
- fusion = FusionCustomFC(self)
- fusion.apply()
-
- def fuse_custom_fc_torchvision_vit(self):
- fusion = FusionTorchvisionVitCustomFC(self)
- fusion.apply()
-
- def fuse_custom_fc_activation(self):
- fusion = FusionCustomFCActivation(self)
- fusion.apply()
-
- def fuse_custom_fc_gpt2_classify(self):
- fusion = FusionCustomFCGPT2(self)
- fusion.apply()
-
- def fuse_swinT_serial_bias_add(self):
- fusion = FusionSerialBiasAdd(self)
- fusion.apply()
-
- def fuse_gelu(self):
- fusion = FusionGelu(self)
- fusion.apply()
- fusion = FusionFastGelu(self)
- fusion.apply()
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedGelu(self)
- fusion.apply()
-
- def fuse_bias_gelu(self, is_fastgelu):
- fusion = FusionBiasGelu(self, is_fastgelu)
- fusion.apply()
-
- def fuse_custom_xsoftmax(self):
- fusion = FusionXSoftmax(self)
- fusion.apply()
-
- def fuse_disentangled_attention(self):
- fusion = FusionDisentangledAttention(self)
- fusion.apply()
-
- def gelu_approximation(self):
- fusion = FusionGeluApproximation(self)
- fusion.apply()
-
- def fuse_add_bias_skip_layer_norm(self):
- fusion = FusionBiasSkipLayerNormalization(self)
- fusion.apply()
-
- def fuse_reshape(self):
- fusion = FusionReshape(self)
- fusion.apply()
-
- def fuse_shape(self):
- fusion = FusionShape(self)
- fusion.apply()
-
- def fuse_embed_layer(self):
- fusion = FusionEmbedLayerNormalization(self)
- fusion.apply()
-
- def fuse_layer_norm(self):
- fusion = FusionLayerNormalization(self, self.hidden_size)
- fusion.apply()
-
- fusion = FusionLayerNormalizationTF(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedLayerNormalization(self)
- fusion.apply()
-
- def fuse_skip_layer_norm(self):
- fusion = FusionSkipLayerNormalization(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- def fuse_qordered_mamtul(self):
- fusion = FusionQOrderedMatMul(self)
- fusion.apply()
-
- def fuse_omdet_inverse_sigmoid(self):
- fusion = FusionLayerInverseSigmoid(self)
- fusion.apply()
-
- def fuse_omdet_attention(self):
- fusion = FusionLayerOmdetAttention(self)
- fusion.apply()
-
- def fuse_l2_normalization(self):
- fusion = FusionLayerL2Normalization(self)
- fusion.apply()
-
- def fuse_splitQKV_update_kv_cache(self):
- fusion = FusionCosyVoiceSplitQKVUpdateKVCache(self, self.hidden_size, self.num_heads)
- fusion.apply()
-
- def fuse_cosyvoice_attention(self):
- fusion = FusionCosyvoiceAttention(self)
- fusion.apply()
-
- def fuse_cosyvoice_split_qkv(self):
- fusion = FusionSplitQKV(self, self.hidden_size, self.num_heads)
- fusion.apply()
-
-
- def get_graph_inputs_from_node_type(
- self, op_type: str, input_indices: List[int], casted: bool
- ):
- """
- Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention).
-        Returns a list of graph input names, filtered by whether the input is casted or not.
- """
- graph_inputs = []
-
- output_name_to_node = self.output_name_to_node()
- nodes = self.get_nodes_by_op_type(op_type)
- for node in nodes:
- bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)]
- for bert_input in bert_inputs:
- if self.find_graph_input(bert_input):
- if not casted:
- graph_inputs.append(bert_input)
- elif bert_input in output_name_to_node:
- parent = output_name_to_node[bert_input]
- if (
- parent.op_type == "Cast"
- and self.find_graph_input(parent.input[0]) is not None
- ):
- if casted:
- graph_inputs.append(parent.input[0])
- return graph_inputs
-
- def get_graph_inputs_from_fused_nodes(self, casted: bool):
- inputs = self.get_graph_inputs_from_node_type(
- "EmbedLayerNormalization", [0, 1, 7], casted
- )
- inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted)
- return inputs
-
- def change_graph_input_type(
- self,
- graph: GraphProto,
- graph_input: ValueInfoProto,
- new_type: int = TensorProto.INT32,
- ):
- """Change graph input type, and add Cast node if needed.
-
- Args:
- graph (GraphProto): graph
-            graph_input (ValueInfoProto): input of the graph
- new_type (int, optional): new data type. Defaults to TensorProto.INT32.
-
- Returns:
-            NodeProto: the new Cast node that was added, or None if no Cast node was added.
- List[NodeProto]: Cast nodes that have been removed.
- """
- assert isinstance(graph, GraphProto)
- assert isinstance(graph_input, ValueInfoProto)
- assert self.find_graph_input(graph_input.name)
-
- if graph_input.type.tensor_type.elem_type == int(new_type):
- return None, []
-
- new_cast_node = None
- nodes_to_remove = []
-
- input_name_to_nodes = self.input_name_to_nodes()
- if graph_input.name in input_name_to_nodes:
- nodes = input_name_to_nodes[graph_input.name]
-
-            # For child nodes that are not Cast nodes, insert a Cast node to convert int32 back to the original data type.
- nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
- if nodes_not_cast:
- node_name = self.create_node_name("Cast")
- output_name = node_name + "_" + graph_input.name
- new_value_info = graph.value_info.add()
- new_value_info.CopyFrom(graph_input)
- new_value_info.name = output_name
- new_cast_node = helper.make_node(
- "Cast",
- [graph_input.name],
- [output_name],
- to=int(graph_input.type.tensor_type.elem_type),
- name=node_name,
- )
- graph.node.extend([new_cast_node])
-
- for node in nodes_not_cast:
- OnnxModel.replace_node_input(node, graph_input.name, output_name)
-
-            # For child nodes that are already Cast nodes, there is no need to insert another Cast.
-            # When a child casts to int32, that Cast node can be removed since the input type is int32 now.
- nodes_cast = [node for node in nodes if node.op_type == "Cast"]
- for node in nodes_cast:
- if OnnxModel.get_node_attribute(node, "to") == int(new_type):
- self.replace_input_of_all_nodes(node.output[0], graph_input.name)
- if not self.find_graph_output(node.output[0]):
- nodes_to_remove.append(node)
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
-
- graph_input.type.tensor_type.elem_type = int(new_type)
- return new_cast_node, nodes_to_remove
-
- def change_graph_inputs_to_int32(self):
- """Change data type of all graph inputs to int32 type, and add Cast node if needed."""
- graph = self.graph()
- add_cast_count = 0
- remove_cast_count = 0
- for graph_input in graph.input:
- new_node, removed_nodes = self.change_graph_input_type(
- graph, graph_input, TensorProto.INT32
- )
- if new_node:
- add_cast_count += 1
- remove_cast_count += len(removed_nodes)
- logger.info(
- f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes."
- )
-
- def use_dynamic_axes(
- self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"
- ):
- """
- Update input and output shape to use dynamic axes.
- """
- bert_graph_inputs = self.get_graph_inputs_from_fused_nodes(
- casted=True
- ) + self.get_graph_inputs_from_fused_nodes(casted=False)
-
- dynamic_batch_inputs = {}
- for input in self.model.graph.input:
- if input.name in bert_graph_inputs:
- dim_proto = input.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
- if dynamic_seq_len is not None:
- dim_proto = input.type.tensor_type.shape.dim[1]
- dim_proto.dim_param = dynamic_seq_len
-
- for output in self.model.graph.output:
- dim_proto = output.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
-
- def preprocess(self):
- self.adjust_reshape_and_expand()
- return
-
- def adjust_reshape_and_expand(self):
- nodes_to_remove = []
- for node in self.nodes():
- if node.op_type == "Reshape":
-                # Clean up unnecessary Reshape nodes.
-                # Find Reshape nodes whose "shape" input has no actual data and remove them.
- reshape_shape = self.get_constant_value(node.input[1])
- if reshape_shape is not None and reshape_shape.size == 0:
- nodes_to_remove.extend([node])
- self.replace_input_of_all_nodes(node.output[0], node.input[0])
- continue
-
- # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
- # changing current reshape's input to output of slice.
- reshape_path = self.match_parent_path(
- node,
- ["Expand", "Expand", "Reshape", "Slice"],
- [0, 0, 0, 0],
- self.output_name_to_node(),
- )
- if reshape_path is not None:
- expand_node = reshape_path[-3]
- expand_shape_value = self.get_constant_value(expand_node.input[1])
-
- reshape_before_expand = reshape_path[-2]
- shape_value = self.get_constant_value(
- reshape_before_expand.input[1]
- )
-
- slice_node = reshape_path[-1]
- if (
- expand_shape_value is not None
- and shape_value is not None
- and len(expand_shape_value) == 2
- and len(shape_value) == 1
- and expand_shape_value[1] == shape_value[0]
- ):
- node.input[0] = slice_node.output[0]
-
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
- logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
-
- def clean_graph(self):
- output_name_to_node = self.output_name_to_node()
- nodes_to_remove = []
- for node in self.nodes():
- # Before:
- # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
- # | |
- # | v
-            #  +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
-            # After:
-            #  input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
- # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value)
- op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
- if node.op_type in op_input_id:
- i = op_input_id[node.op_type]
- parent_nodes = self.match_parent_path(
- node,
- [
- "Cast",
- "ConstantOfShape",
- "Concat",
- "Unsqueeze",
- "Gather",
- "Shape",
- ],
- [i, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- (
- cast,
- constantOfShape,
- concat,
- unsqueeze,
- gather,
- shape,
- ) = parent_nodes
- if shape.input[0] == self.graph().input[0].name:
- constantOfShape.input[0] = shape.output[0]
- output_name_to_node = self.output_name_to_node()
-
- if node.op_type == "Attention":
- # Before:
- # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
- # After:
- # remove this path, and remove the optional mask_index input of Attention node.
- parent_nodes = self.match_parent_path(
- node,
- ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
- [3, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- if parent_nodes[-1].input[0] == self.graph().input[0].name:
- attention_node = helper.make_node(
- "Attention",
- inputs=node.input[0 : len(node.input) - 1],
- outputs=node.output,
- name=node.name + "_remove_mask",
- )
- attention_node.domain = "com.microsoft"
- attention_node.attribute.extend(
- [helper.make_attribute("num_heads", self.num_heads)]
- )
- self.add_node(
- attention_node, self.get_graph_by_node(attention_node).name
- )
- nodes_to_remove.append(node)
- self.remove_nodes(nodes_to_remove)
-
- def postprocess(self):
- self.clean_graph()
- self.prune_graph()
-
- def optimize(
- self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
- ):
- if (options is not None) and not options.enable_shape_inference:
- self.disable_shape_inference()
-
- self.utils.remove_identity_nodes()
-
-        # Remove Cast nodes whose input and output have the same data type, based on symbolic shape inference.
- self.utils.remove_useless_cast_nodes()
-
- if (options is None) or options.enable_layer_norm:
- self.fuse_layer_norm()
-
- if (options is None) or options.enable_gelu:
- self.fuse_gelu()
-
- self.preprocess()
-
- self.fuse_reshape()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- if options.enable_swint_opt:
- self.fuse_custom_fc()
- self.fuse_swinT_serial_bias_add()
-
- if options.enable_format_roformer:
- self.fuse_format_roformer()
-
- if options.enable_gpt2_classify or options.enable_vit:
- self.fuse_custom_fc_gpt2_classify()
-
- if options.enable_vit:
- self.fuse_custom_fc()
-
- if (options is None) or options.enable_attention:
- if options is not None:
- self.attention_mask.set_mask_format(options.attention_mask_format)
- self.fuse_attention()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- self.fuse_custom_fc()
-
- if options.enable_omdet:
- self.fuse_omdet_attention()
- self.fuse_omdet_inverse_sigmoid()
- self.fuse_l2_normalization()
-
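-        # CosyVoice-specific fusion sequence: split-QKV with KV-cache update, CosyVoice attention, and split-QKV.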
- self.fuse_splitQKV_update_kv_cache()
- self.fuse_cosyvoice_attention()
- self.fuse_cosyvoice_split_qkv()
-
-
- # Perform the MatMul fusion after the Attention fusion as we do not
- # want to fuse the MatMuls inside the Attention subgraphs
- if (options is None) or options.enable_qordered_matmul:
- self.fuse_qordered_mamtul()
-
- self.fuse_shape()
-
- if (options is None) or options.enable_embed_layer_norm:
- self.fuse_embed_layer()
-
-        # Remove Reshape nodes whose input and output have the same shape, based on symbolic shape inference.
- self.utils.remove_useless_reshape_nodes()
-
- self.postprocess()
-
- # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
- if (options is None) or options.enable_bias_gelu:
- # Fuse Gelu and Add Bias before it.
- self.fuse_bias_gelu(is_fastgelu=True)
- self.fuse_bias_gelu(is_fastgelu=False)
-
- if (options is None) or options.enable_bias_skip_layer_norm:
- # Fuse SkipLayerNormalization and Add Bias before it.
- self.fuse_add_bias_skip_layer_norm()
-
- if options is not None and options.enable_gelu_approximation:
- self.gelu_approximation()
-
- self.fuse_custom_fc_activation()
-
- if options.enable_vit:
- self.fuse_custom_fc_torchvision_vit()
-
- self.remove_unused_constant()
-
- # Use symbolic batch dimension in input and output.
- if add_dynamic_axes:
- self.use_dynamic_axes()
-
- logger.info(f"opset version: {self.get_opset_version()}")
-
- def get_fused_operator_statistics(self):
- """
- Returns node count of fused operators.
- """
- op_count = {}
- ops = [
- "EmbedLayerNormalization",
- "Attention",
- "QOrderedAttention",
- "Gelu",
- "QOrderedGelu",
- "FastGelu",
- "BiasGelu",
- "LayerNormalization",
- "QOrderedLayerNormalization",
- "SkipLayerNormalization",
- "QOrderedMatMul",
- ]
- for op in ops:
- nodes = self.get_nodes_by_op_type(op)
- op_count[op] = len(nodes)
- logger.info(f"Optimized operators:{op_count}")
- return op_count
-
- def is_fully_optimized(self):
- """
- Returns True when the model is fully optimized.
- """
- op_count = self.get_fused_operator_statistics()
- embed = op_count["EmbedLayerNormalization"]
- attention = op_count["Attention"] + op_count["QOrderedAttention"]
- gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"]
- layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"]
- is_perfect = (
- (embed > 0)
- and (attention > 0)
- and (attention == gelu)
- and (layer_norm >= 2 * attention)
- )
-
- if layer_norm == 0:
- logger.debug("Layer Normalization not fused")
-
- if gelu == 0:
- logger.debug("Gelu/FastGelu not fused")
-
- if embed == 0:
- logger.debug("Embed Layer not fused")
-
- if attention == 0:
- logger.warning("Attention not fused")
-
- return is_perfect
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py
deleted file mode 100644
index 7bffb2e7cbec870423cd006d33a617dd1e70d1fb..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py
+++ /dev/null
@@ -1,555 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_albert_attention import FusionAlbertAttention
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_customfc import (
- FusionCustomFC,
- FusionCustomFCActivation,
- FusionCustomFcRoformer,
-)
-from passes.fusion_disentangled_attention import FusionDisentangledAttention
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
- FusionFormatInvalidMask,
- FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_layernorm import (
- FusionLayerNormalization,
- FusionLayerNormalizationKeras,
- FusionLayerNormalizationTF,
-)
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_roformer_attention import FusionRoformerCrossAttention
-from passes.fusion_rope import FusionRoPE
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
- FusionBiasSkipLayerNormalization,
- FusionSkipLayerNormalization,
-)
-from passes.fusion_swinl_attention import FusionSwinLAttention
-from passes.fusion_utils import FusionUtils
-from passes.fusion_videobert_attention import FusionVideoBertAttention
-from passes.fusion_vit_attention import FusionVITAttention
-from passes.fusion_xsoftmax import FusionXSoftmax
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class RoformerOnnxModel(OnnxModel):
- def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
- """Initialize BERT ONNX Model.
-
- Args:
- model (ModelProto): the ONNX model
- num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
- hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
- """
- assert (num_heads == 0 and hidden_size == 0) or (
- num_heads > 0 and hidden_size % num_heads == 0
- )
-
- super().__init__(model)
- self.num_heads = num_heads
- self.hidden_size = hidden_size
-
- self.attention_mask = AttentionMask(self)
- self.attention_fusion = FusionAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.qordered_attention_fusion = FusionQOrderedAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.utils = FusionUtils(self)
-
- def fuse_attention(self):
- FusionRoformerCrossAttention(self).apply()
-
- def fuse_format_roformer(self):
- # FusionRemoveUselessElementwise(self).apply()
- fusion = FusionFormatInvalidMask(self)
- fusion.apply()
-
- def fuse_custom_fc(self):
- fusion = FusionCustomFC(self)
- fusion.apply()
-
- def fuse_custom_fc_activation(self):
- fusion = FusionCustomFCActivation(self)
- fusion.apply()
-
- def fuse_custom_fc_roformer(self):
- fusion = FusionCustomFcRoformer(self)
- fusion.apply()
-
- def fuse_rope(self):
- fusion = FusionRoPE(self)
- fusion.apply()
-
- def fuse_swinT_serial_bias_add(self):
- fusion = FusionSerialBiasAdd(self)
- fusion.apply()
-
- def fuse_gelu(self):
- fusion = FusionGelu(self)
- fusion.apply()
- fusion = FusionFastGelu(self)
- fusion.apply()
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedGelu(self)
- fusion.apply()
-
- def fuse_bias_gelu(self, is_fastgelu):
- fusion = FusionBiasGelu(self, is_fastgelu)
- fusion.apply()
-
- def gelu_approximation(self):
- fusion = FusionGeluApproximation(self)
- fusion.apply()
-
- def fuse_add_bias_skip_layer_norm(self):
- fusion = FusionBiasSkipLayerNormalization(self)
- fusion.apply()
-
- def fuse_reshape(self):
- fusion = FusionReshape(self)
- fusion.apply()
-
- def fuse_shape(self):
- fusion = FusionShape(self)
- fusion.apply()
-
- def fuse_embed_layer(self):
- fusion = FusionEmbedLayerNormalization(self)
- fusion.apply()
-
- def fuse_layer_norm(self):
- fusion = FusionLayerNormalizationKeras(self)
- fusion.apply()
-
- def fuse_skip_layer_norm(self):
- fusion = FusionSkipLayerNormalization(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- def fuse_qordered_mamtul(self):
- fusion = FusionQOrderedMatMul(self)
- fusion.apply()
-
- def get_graph_inputs_from_node_type(
- self, op_type: str, input_indices: List[int], casted: bool
- ):
- """
- Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention).
-        Returns a list of graph input names, filtered by whether the input is casted or not.
- """
- graph_inputs = []
-
- output_name_to_node = self.output_name_to_node()
- nodes = self.get_nodes_by_op_type(op_type)
- for node in nodes:
- bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)]
- for bert_input in bert_inputs:
- if self.find_graph_input(bert_input):
- if not casted:
- graph_inputs.append(bert_input)
- elif bert_input in output_name_to_node:
- parent = output_name_to_node[bert_input]
- if (
- parent.op_type == "Cast"
- and self.find_graph_input(parent.input[0]) is not None
- ):
- if casted:
- graph_inputs.append(parent.input[0])
- return graph_inputs
-
- def get_graph_inputs_from_fused_nodes(self, casted: bool):
- inputs = self.get_graph_inputs_from_node_type(
- "EmbedLayerNormalization", [0, 1, 7], casted
- )
- inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted)
- return inputs
-
- def change_graph_input_type(
- self,
- graph: GraphProto,
- graph_input: ValueInfoProto,
- new_type: int = TensorProto.INT32,
- ):
- """Change graph input type, and add Cast node if needed.
-
- Args:
- graph (GraphProto): graph
-            graph_input (ValueInfoProto): input of the graph
- new_type (int, optional): new data type. Defaults to TensorProto.INT32.
-
- Returns:
-            NodeProto: the new Cast node that was added, or None if no Cast node was added.
- List[NodeProto]: Cast nodes that have been removed.
- """
- assert isinstance(graph, GraphProto)
- assert isinstance(graph_input, ValueInfoProto)
- assert self.find_graph_input(graph_input.name)
-
- if graph_input.type.tensor_type.elem_type == int(new_type):
- return None, []
-
- new_cast_node = None
- nodes_to_remove = []
-
- input_name_to_nodes = self.input_name_to_nodes()
- if graph_input.name in input_name_to_nodes:
- nodes = input_name_to_nodes[graph_input.name]
-
-            # For child nodes that are not Cast nodes, insert a Cast node to convert int32 back to the original data type.
- nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
- if nodes_not_cast:
- node_name = self.create_node_name("Cast")
- output_name = node_name + "_" + graph_input.name
- new_value_info = graph.value_info.add()
- new_value_info.CopyFrom(graph_input)
- new_value_info.name = output_name
- new_cast_node = helper.make_node(
- "Cast",
- [graph_input.name],
- [output_name],
- to=int(graph_input.type.tensor_type.elem_type),
- name=node_name,
- )
- graph.node.extend([new_cast_node])
-
- for node in nodes_not_cast:
- OnnxModel.replace_node_input(node, graph_input.name, output_name)
-
- # For children that are Cast nodes, there is no need to insert another Cast.
- # When a child casts to int32, that Cast node can be removed since the input type is int32 now.
- nodes_cast = [node for node in nodes if node.op_type == "Cast"]
- for node in nodes_cast:
- if OnnxModel.get_node_attribute(node, "to") == int(new_type):
- self.replace_input_of_all_nodes(node.output[0], graph_input.name)
- if not self.find_graph_output(node.output[0]):
- nodes_to_remove.append(node)
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
-
- graph_input.type.tensor_type.elem_type = int(new_type)
- return new_cast_node, nodes_to_remove
-
- def change_graph_inputs_to_int32(self):
- """Change data type of all graph inputs to int32 type, and add Cast node if needed."""
- graph = self.graph()
- add_cast_count = 0
- remove_cast_count = 0
- for graph_input in graph.input:
- new_node, removed_nodes = self.change_graph_input_type(
- graph, graph_input, TensorProto.INT32
- )
- if new_node:
- add_cast_count += 1
- remove_cast_count += len(removed_nodes)
- logger.info(
- f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes."
- )
-
- def use_dynamic_axes(
- self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"
- ):
- """
- Update input and output shape to use dynamic axes.
- """
- bert_graph_inputs = self.get_graph_inputs_from_fused_nodes(
- casted=True
- ) + self.get_graph_inputs_from_fused_nodes(casted=False)
-
- dynamic_batch_inputs = {}
- for input in self.model.graph.input:
- if input.name in bert_graph_inputs:
- dim_proto = input.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
- if dynamic_seq_len is not None:
- dim_proto = input.type.tensor_type.shape.dim[1]
- dim_proto.dim_param = dynamic_seq_len
-
- for output in self.model.graph.output:
- dim_proto = output.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
-
- def preprocess(self):
- self.adjust_reshape_and_expand()
- return
-
- def adjust_reshape_and_expand(self):
- nodes_to_remove = []
- for node in self.nodes():
- if node.op_type == "Reshape":
- # Clean up unnecessary Reshape nodes.
- # Find Reshape nodes whose "shape" input has no actual data and remove them.
- reshape_shape = self.get_constant_value(node.input[1])
- if reshape_shape is not None and reshape_shape.size == 0:
- nodes_to_remove.extend([node])
- self.replace_input_of_all_nodes(node.output[0], node.input[0])
- continue
-
- # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
- # changing current reshape's input to output of slice.
- reshape_path = self.match_parent_path(
- node,
- ["Expand", "Expand", "Reshape", "Slice"],
- [0, 0, 0, 0],
- self.output_name_to_node(),
- )
- if reshape_path is not None:
- expand_node = reshape_path[-3]
- expand_shape_value = self.get_constant_value(expand_node.input[1])
-
- reshape_before_expand = reshape_path[-2]
- shape_value = self.get_constant_value(
- reshape_before_expand.input[1]
- )
-
- slice_node = reshape_path[-1]
- if (
- expand_shape_value is not None
- and shape_value is not None
- and len(expand_shape_value) == 2
- and len(shape_value) == 1
- and expand_shape_value[1] == shape_value[0]
- ):
- node.input[0] = slice_node.output[0]
-
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
- logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
-
- def clean_graph(self):
- output_name_to_node = self.output_name_to_node()
- nodes_to_remove = []
- for node in self.nodes():
- # Before:
- # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
- # | |
- # | v
- # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
- # After:
- # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
- # TODO: merge ConstantOfShape -->Cast into ConstantOfShape (need to update the data type of the value)
- op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
- if node.op_type in op_input_id:
- i = op_input_id[node.op_type]
- parent_nodes = self.match_parent_path(
- node,
- [
- "Cast",
- "ConstantOfShape",
- "Concat",
- "Unsqueeze",
- "Gather",
- "Shape",
- ],
- [i, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- (
- cast,
- constantOfShape,
- concat,
- unsqueeze,
- gather,
- shape,
- ) = parent_nodes
- if shape.input[0] == self.graph().input[0].name:
- constantOfShape.input[0] = shape.output[0]
- output_name_to_node = self.output_name_to_node()
-
- if node.op_type == "Attention":
- # Before:
- # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
- # After:
- # remove this path, and remove the optional mask_index input of Attention node.
- parent_nodes = self.match_parent_path(
- node,
- ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
- [3, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- if parent_nodes[-1].input[0] == self.graph().input[0].name:
- attention_node = helper.make_node(
- "Attention",
- inputs=node.input[0 : len(node.input) - 1],
- outputs=node.output,
- name=node.name + "_remove_mask",
- )
- attention_node.domain = "com.microsoft"
- attention_node.attribute.extend(
- [helper.make_attribute("num_heads", self.num_heads)]
- )
- self.add_node(
- attention_node, self.get_graph_by_node(attention_node).name
- )
- nodes_to_remove.append(node)
- self.remove_nodes(nodes_to_remove)
-
- def postprocess(self):
- self.clean_graph()
- self.prune_graph()
-
- def optimize(
- self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
- ):
- if (options is not None) and not options.enable_shape_inference:
- self.disable_shape_inference()
-
- self.utils.remove_identity_nodes()
-
- # Remove Cast nodes whose input and output have the same data type, based on symbolic shape inference.
- self.utils.remove_useless_cast_nodes()
-
- if (options is None) or options.enable_layer_norm:
- self.fuse_layer_norm()
-
- if (options is None) or options.enable_gelu:
- self.fuse_gelu()
-
- self.preprocess()
-
- self.fuse_reshape()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- if (options is None) or options.enable_format_roformer:
- self.fuse_format_roformer()
-
- self.fuse_custom_fc_roformer()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- self.fuse_custom_fc()
-
- if (options is None) or options.enable_attention:
- if options is not None:
- self.attention_mask.set_mask_format(options.attention_mask_format)
- self.fuse_attention()
-
- self.fuse_rope()
-
- self.fuse_shape()
-
- # Remove Reshape nodes whose input and output have the same shape, based on symbolic shape inference.
- self.utils.remove_useless_reshape_nodes()
-
- self.postprocess()
-
- # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
- if (options is None) or options.enable_bias_gelu:
- # Fuse Gelu and Add Bias before it.
- self.fuse_bias_gelu(is_fastgelu=True)
- self.fuse_bias_gelu(is_fastgelu=False)
-
- if (options is None) or options.enable_bias_skip_layer_norm:
- # Fuse SkipLayerNormalization and Add Bias before it.
- self.fuse_add_bias_skip_layer_norm()
-
- if options is not None and options.enable_gelu_approximation:
- self.gelu_approximation()
-
- self.fuse_custom_fc_activation()
-
- self.remove_unused_constant()
-
- # Use symbolic batch dimension in input and output.
- if add_dynamic_axes:
- self.use_dynamic_axes()
-
- logger.info(f"opset version: {self.get_opset_version()}")
-
- def get_fused_operator_statistics(self):
- """
- Returns node count of fused operators.
- """
- op_count = {}
- ops = [
- "EmbedLayerNormalization",
- "Attention",
- "QOrderedAttention",
- "Gelu",
- "QOrderedGelu",
- "FastGelu",
- "BiasGelu",
- "LayerNormalization",
- "QOrderedLayerNormalization",
- "SkipLayerNormalization",
- "QOrderedMatMul",
- ]
- for op in ops:
- nodes = self.get_nodes_by_op_type(op)
- op_count[op] = len(nodes)
- logger.info(f"Optimized operators:{op_count}")
- return op_count
-
- def is_fully_optimized(self):
- """
- Returns True when the model is fully optimized.
- """
- op_count = self.get_fused_operator_statistics()
- embed = op_count["EmbedLayerNormalization"]
- attention = op_count["Attention"] + op_count["QOrderedAttention"]
- gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"]
- layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"]
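- # Heuristic: a fully fused transformer block is expected to contribute one fused Gelu
- # variant and at least two (Skip)LayerNormalization nodes per fused Attention node,
- # hence the checks below.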
- is_perfect = (
- (embed > 0)
- and (attention > 0)
- and (attention == gelu)
- and (layer_norm >= 2 * attention)
- )
-
- if layer_norm == 0:
- logger.debug("Layer Normalization not fused")
-
- if gelu == 0:
- logger.debug("Gelu/FastGelu not fused")
-
- if embed == 0:
- logger.debug("Embed Layer not fused")
-
- if attention == 0:
- logger.warning("Attention not fused")
-
- return is_perfect
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py
deleted file mode 100644
index dac070d24a66812c4b14cfeff5b7c78ff44c6711..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py
+++ /dev/null
@@ -1,550 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_rms_norm import FusionRMSNorm
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
- FusionBiasSkipLayerNormalization,
- FusionSkipLayerNormalization,
-)
-from passes.fusion_splitQKV_update_KVcache import FusionSplitQKVUpdateKVCache
-from passes.fusion_t5_attention import (
- FusionT5DecoderAttention,
- FusionT5EncoderAttention,
-)
-from passes.fusion_utils import FusionUtils
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class BertOptimizationOptions(FusionOptions):
- """This class is deprecated"""
-
- def __init__(self, model_type):
- logger.warning(
- f"BertOptimizationOptions is depreciated. Please use FusionOptions instead."
- )
- super().__init__(model_type)
-
-
-class T5OnnxModel(OnnxModel):
- def __init__(self, model: ModelProto, num_heads=12, hidden_size=768):
- """Initialize T5 ONNX Model.
-
- Args:
- model (ModelProto): the ONNX model
- num_heads (int, optional): number of attention heads. Defaults to 12; pass 0 to detect the parameter automatically.
- hidden_size (int, optional): hidden dimension. Defaults to 768; pass 0 to detect the parameter automatically.
- """
- assert (num_heads == 0 and hidden_size == 0) or (
- num_heads > 0 and hidden_size % num_heads == 0
- )
-
- super().__init__(model)
- self.num_heads = num_heads
- self.hidden_size = hidden_size
- self.attention_mask = AttentionMask(self)
- self.attention_fusion = FusionAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.qordered_attention_fusion = FusionQOrderedAttention(
- self, self.hidden_size, self.num_heads, self.attention_mask
- )
- self.utils = FusionUtils(self)
-
- def fuse_custom_fc(self):
- fusion = FusionCustomFC(self)
- fusion.apply()
-
- def fuse_custom_fc_activation(self):
- fusion = FusionCustomFCActivation(self)
- fusion.apply()
-
- def fuse_gelu(self):
- fusion = FusionGelu(self)
- fusion.apply()
- fusion = FusionFastGelu(self)
- fusion.apply()
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedGelu(self)
- fusion.apply()
-
- def fuse_bias_gelu(self, is_fastgelu):
- fusion = FusionBiasGelu(self, is_fastgelu)
- fusion.apply()
-
- def gelu_approximation(self):
- fusion = FusionGeluApproximation(self)
- fusion.apply()
-
- def fuse_add_bias_skip_layer_norm(self):
- fusion = FusionBiasSkipLayerNormalization(self)
- fusion.apply()
-
- def fuse_reshape(self):
- fusion = FusionReshape(self)
- fusion.apply()
-
- def fuse_shape(self):
- fusion = FusionShape(self)
- fusion.apply()
-
- def fuse_embed_layer(self):
- fusion = FusionEmbedLayerNormalization(self)
- fusion.apply()
-
- def fuse_rms_norm(self):
- fusion = FusionRMSNorm(self)
- fusion.apply()
-
- def fuse_t5_encoder_attention(self):
- fusion = FusionT5EncoderAttention(self)
- fusion.apply()
-
- def fuse_t5_decoder_attention(self):
- fusion = FusionT5DecoderAttention(self)
- fusion.apply()
- # pass
-
- def fuse_layer_norm(self):
- fusion = FusionLayerNormalization(self, hidden_size=768)
- fusion.apply()
-
- fusion = FusionLayerNormalizationTF(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedLayerNormalization(self)
- fusion.apply()
-
- def fuse_skip_layer_norm(self):
- fusion = FusionSkipLayerNormalization(self)
- fusion.apply()
-
- def fuse_splitQKV_update_kv_cache(self):
- fusion = FusionSplitQKVUpdateKVCache(self, self.hidden_size, self.num_heads)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- def fuse_qordered_mamtul(self):
- fusion = FusionQOrderedMatMul(self)
- fusion.apply()
-
- def get_graph_inputs_from_node_type(
- self, op_type: str, input_indices: List[int], casted: bool
- ):
- """
- Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention).
- Returns a list of graph input names, filtered by whether each input is casted.
- """
- graph_inputs = []
-
- output_name_to_node = self.output_name_to_node()
- nodes = self.get_nodes_by_op_type(op_type)
- for node in nodes:
- bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)]
- for bert_input in bert_inputs:
- if self.find_graph_input(bert_input):
- if not casted:
- graph_inputs.append(bert_input)
- elif bert_input in output_name_to_node:
- parent = output_name_to_node[bert_input]
- if (
- parent.op_type == "Cast"
- and self.find_graph_input(parent.input[0]) is not None
- ):
- if casted:
- graph_inputs.append(parent.input[0])
- return graph_inputs
-
- def get_graph_inputs_from_fused_nodes(self, casted: bool):
- inputs = self.get_graph_inputs_from_node_type(
- "EmbedLayerNormalization", [0, 1, 7], casted
- )
- inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted)
- return inputs
-
- def change_graph_input_type(
- self,
- graph: GraphProto,
- graph_input: ValueInfoProto,
- new_type: int = TensorProto.INT32,
- ):
- """Change graph input type, and add Cast node if needed.
-
- Args:
- graph (GraphProto): graph
- graph_input (ValueInfoProto): input of the graph
- new_type (int, optional): new data type. Defaults to TensorProto.INT32.
-
- Returns:
- NodeProto: the new Cast node that was added, or None if no Cast node was added.
- List[NodeProto]: Cast nodes that have been removed.
- """
- assert isinstance(graph, GraphProto)
- assert isinstance(graph_input, ValueInfoProto)
- assert self.find_graph_input(graph_input.name)
-
- if graph_input.type.tensor_type.elem_type == int(new_type):
- return None, []
-
- new_cast_node = None
- nodes_to_remove = []
-
- input_name_to_nodes = self.input_name_to_nodes()
- if graph_input.name in input_name_to_nodes:
- nodes = input_name_to_nodes[graph_input.name]
-
- # For each child that is not a Cast node, insert a Cast node to convert int32 back to the original data type.
- nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
- if nodes_not_cast:
- node_name = self.create_node_name("Cast")
- output_name = node_name + "_" + graph_input.name
- new_value_info = graph.value_info.add()
- new_value_info.CopyFrom(graph_input)
- new_value_info.name = output_name
- new_cast_node = helper.make_node(
- "Cast",
- [graph_input.name],
- [output_name],
- to=int(graph_input.type.tensor_type.elem_type),
- name=node_name,
- )
- graph.node.extend([new_cast_node])
-
- for node in nodes_not_cast:
- OnnxModel.replace_node_input(node, graph_input.name, output_name)
-
- # For children that are Cast nodes, there is no need to insert another Cast.
- # When a child casts to int32, that Cast node can be removed since the input type is int32 now.
- nodes_cast = [node for node in nodes if node.op_type == "Cast"]
- for node in nodes_cast:
- if OnnxModel.get_node_attribute(node, "to") == int(new_type):
- self.replace_input_of_all_nodes(node.output[0], graph_input.name)
- if not self.find_graph_output(node.output[0]):
- nodes_to_remove.append(node)
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
-
- graph_input.type.tensor_type.elem_type = int(new_type)
- return new_cast_node, nodes_to_remove
-
- def change_graph_inputs_to_int32(self):
- """Change data type of all graph inputs to int32 type, and add Cast node if needed."""
- graph = self.graph()
- add_cast_count = 0
- remove_cast_count = 0
- for graph_input in graph.input:
- new_node, removed_nodes = self.change_graph_input_type(
- graph, graph_input, TensorProto.INT32
- )
- if new_node:
- add_cast_count += 1
- remove_cast_count += len(removed_nodes)
- logger.info(
- f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes."
- )
-
- def use_dynamic_axes(
- self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"
- ):
- """
- Update input and output shape to use dynamic axes.
- """
- bert_graph_inputs = self.get_graph_inputs_from_fused_nodes(
- casted=True
- ) + self.get_graph_inputs_from_fused_nodes(casted=False)
-
- dynamic_batch_inputs = {}
- for input in self.model.graph.input:
- if input.name in bert_graph_inputs:
- dim_proto = input.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
- if dynamic_seq_len is not None:
- dim_proto = input.type.tensor_type.shape.dim[1]
- dim_proto.dim_param = dynamic_seq_len
-
- for output in self.model.graph.output:
- dim_proto = output.type.tensor_type.shape.dim[0]
- dim_proto.dim_param = dynamic_batch_dim
-
- def preprocess(self):
- self.adjust_reshape_and_expand()
- return
-
- def adjust_reshape_and_expand(self):
- nodes_to_remove = []
- for node in self.nodes():
- if node.op_type == "Reshape":
- # Clean up unnecessary Reshape nodes.
- # Find Reshape nodes whose "shape" input has no actual data and remove them.
- reshape_shape = self.get_constant_value(node.input[1])
- if reshape_shape is not None and reshape_shape.size == 0:
- nodes_to_remove.extend([node])
- self.replace_input_of_all_nodes(node.output[0], node.input[0])
- continue
-
- # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
- # changing current reshape's input to output of slice.
- reshape_path = self.match_parent_path(
- node,
- ["Expand", "Expand", "Reshape", "Slice"],
- [0, 0, 0, 0],
- self.output_name_to_node(),
- )
- if reshape_path is not None:
- expand_node = reshape_path[-3]
- expand_shape_value = self.get_constant_value(expand_node.input[1])
-
- reshape_before_expand = reshape_path[-2]
- shape_value = self.get_constant_value(
- reshape_before_expand.input[1]
- )
-
- slice_node = reshape_path[-1]
- if (
- expand_shape_value is not None
- and shape_value is not None
- and len(expand_shape_value) == 2
- and len(shape_value) == 1
- and expand_shape_value[1] == shape_value[0]
- ):
- node.input[0] = slice_node.output[0]
-
- if nodes_to_remove:
- self.remove_nodes(nodes_to_remove)
- logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
-
- def clean_graph(self):
- output_name_to_node = self.output_name_to_node()
- nodes_to_remove = []
- for node in self.nodes():
- # Before:
- # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
- # | |
- # | v
- # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
- # After:
- # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
- # TODO: merge ConstantOfShape -->Cast into ConstantOfShape (need to update the data type of the value)
- op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
- if node.op_type in op_input_id:
- i = op_input_id[node.op_type]
- parent_nodes = self.match_parent_path(
- node,
- [
- "Cast",
- "ConstantOfShape",
- "Concat",
- "Unsqueeze",
- "Gather",
- "Shape",
- ],
- [i, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- (
- cast,
- constantOfShape,
- concat,
- unsqueeze,
- gather,
- shape,
- ) = parent_nodes
- if shape.input[0] == self.graph().input[0].name:
- constantOfShape.input[0] = shape.output[0]
- output_name_to_node = self.output_name_to_node()
-
- if node.op_type == "Attention":
- # Before:
- # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
- # After:
- # remove this path, and remove the optional mask_index input of Attention node.
- parent_nodes = self.match_parent_path(
- node,
- ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
- [3, 0, 0, 0],
- output_name_to_node,
- )
- if parent_nodes is not None:
- if parent_nodes[-1].input[0] == self.graph().input[0].name:
- attention_node = helper.make_node(
- "Attention",
- inputs=node.input[0 : len(node.input) - 1],
- outputs=node.output,
- name=node.name + "_remove_mask",
- )
- attention_node.domain = "com.microsoft"
- attention_node.attribute.extend(
- [helper.make_attribute("num_heads", self.num_heads)]
- )
- self.add_node(
- attention_node, self.get_graph_by_node(attention_node).name
- )
- nodes_to_remove.append(node)
- self.remove_nodes(nodes_to_remove)
-
- def postprocess(self):
- self.clean_graph()
- self.prune_graph()
-
- def optimize(
- self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
- ):
- if (options is not None) and not options.enable_shape_inference:
- self.disable_shape_inference()
-
- self.utils.remove_identity_nodes()
-
- # Remove Cast nodes whose input and output have the same data type, based on symbolic shape inference.
- self.utils.remove_useless_cast_nodes()
-
- if (options is None) or options.enable_layer_norm:
- self.fuse_layer_norm()
-
- if (options is None) or options.enable_gelu:
- self.fuse_gelu()
-
- self.preprocess()
-
- self.fuse_reshape()
-
- if (options is None) or options.enable_skip_layer_norm:
- self.fuse_skip_layer_norm()
-
- # Perform the MatMul fusion after the Attention fusion as we do not
- # want to fuse the MatMuls inside the Attention subgraphs
- if (options is None) or options.enable_qordered_matmul:
- self.fuse_qordered_mamtul()
-
- self.fuse_shape()
-
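- # T5-specific fusions: RMS norm, encoder/decoder attention, and the split-QKV KV-cache update.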
- self.fuse_rms_norm()
-
- self.fuse_t5_encoder_attention()
-
- self.fuse_t5_decoder_attention()
-
- self.fuse_splitQKV_update_kv_cache()
-
- if (options is None) or options.enable_embed_layer_norm:
- self.fuse_embed_layer()
-
- # Remove Reshape nodes whose input and output have the same shape, based on symbolic shape inference.
- self.utils.remove_useless_reshape_nodes()
-
- self.postprocess()
-
- # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
- if (options is None) or options.enable_bias_gelu:
- # Fuse Gelu and Add Bias before it.
- self.fuse_bias_gelu(is_fastgelu=True)
- self.fuse_bias_gelu(is_fastgelu=False)
-
- if (options is None) or options.enable_bias_skip_layer_norm:
- # Fuse SkipLayerNormalization and Add Bias before it.
- self.fuse_add_bias_skip_layer_norm()
-
- if options is not None and options.enable_gelu_approximation:
- self.gelu_approximation()
-
- self.remove_unused_constant()
-
- # Use symbolic batch dimension in input and output.
- if add_dynamic_axes:
- self.use_dynamic_axes()
-
- logger.info(f"opset version: {self.get_opset_version()}")
-
- def get_fused_operator_statistics(self):
- """
- Returns node count of fused operators.
- """
- op_count = {}
- ops = [
- "EmbedLayerNormalization",
- "Attention",
- "QOrderedAttention",
- "Gelu",
- "QOrderedGelu",
- "FastGelu",
- "BiasGelu",
- "LayerNormalization",
- "QOrderedLayerNormalization",
- "SkipLayerNormalization",
- "QOrderedMatMul",
- ]
- for op in ops:
- nodes = self.get_nodes_by_op_type(op)
- op_count[op] = len(nodes)
- logger.info(f"Optimized operators:{op_count}")
- return op_count
-
- def is_fully_optimized(self):
- """
- Returns True when the model is fully optimized.
- """
- op_count = self.get_fused_operator_statistics()
- embed = op_count["EmbedLayerNormalization"]
- attention = op_count["Attention"] + op_count["QOrderedAttention"]
- gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"]
- layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"]
- is_perfect = (
- (embed > 0)
- and (attention > 0)
- and (attention == gelu)
- and (layer_norm >= 2 * attention)
- )
-
- if layer_norm == 0:
- logger.debug("Layer Normalization not fused")
-
- if gelu == 0:
- logger.debug("Gelu/FastGelu not fused")
-
- if embed == 0:
- logger.debug("Embed Layer not fused")
-
- if attention == 0:
- logger.warning("Attention not fused")
-
- return is_perfect
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py
deleted file mode 100644
index 42b504c42edfc006b5efac0d385001780d296fb2..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-from onnx import ModelProto
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
- FusionFormatInvalidMask,
- FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_shape import FusionShape
-from passes.fusion_utils import FusionUtils
-from passes.fusion_yolov5_decoder import FusionYoloV5Decoder
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class YoloOnnxModel(OnnxModel):
- def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
- """Initialize BERT ONNX Model.
-
- Args:
- model (ModelProto): the ONNX model
- num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
- hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
- """
- assert (num_heads == 0 and hidden_size == 0) or (
- num_heads > 0 and hidden_size % num_heads == 0
- )
- super().__init__(model)
- self.utils = FusionUtils(self)
-
- def fuse_format_roformer(self):
- FusionRemoveUselessElementwise(self).apply()
- fusion = FusionFormatInvalidMask(self)
- fusion.apply()
-
- def fuse_custom_fc(self):
- fusion = FusionCustomFC(self)
- fusion.apply()
-
- def fuse_custom_fc_activation(self):
- fusion = FusionCustomFCActivation(self)
- fusion.apply()
-
- def fuse_swinT_serial_bias_add(self):
- fusion = FusionSerialBiasAdd(self)
- fusion.apply()
-
- def fuse_gelu(self):
- fusion = FusionGelu(self)
- fusion.apply()
- fusion = FusionFastGelu(self)
- fusion.apply()
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedGelu(self)
- fusion.apply()
-
- def fuse_reshape(self):
- fusion = FusionReshape(self)
- fusion.apply()
-
- def fuse_shape(self):
- fusion = FusionShape(self)
- fusion.apply()
-
- def fuse_layer_norm(self):
- fusion = FusionLayerNormalization(self, 0)
- fusion.apply()
-
- fusion = FusionLayerNormalizationTF(self)
- fusion.apply()
-
- # Only relevant in models with Q-DQ nodes
- fusion = FusionQOrderedLayerNormalization(self)
- fusion.apply()
-
- def optimize(
- self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
- ):
- if (options is not None) and not options.enable_shape_inference:
- self.disable_shape_inference()
-
- self.utils.remove_identity_nodes()
-
- # Remove Cast nodes whose input and output have the same data type, based on symbolic shape inference.
- self.utils.remove_useless_cast_nodes()
-
- if (options is None) or options.enable_layer_norm:
- self.fuse_layer_norm()
-
- if (options is None) or options.enable_gelu:
- self.fuse_gelu()
-
- self.fuse_reshape()
-
- FusionYoloV5Decoder(self).apply()
- self.remove_unused_constant()
- logger.info(f"opset version: {self.get_opset_version()}")
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md
deleted file mode 100644
index dc823d366b327141bd5646e7d3aef153349cea8e..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# IxRT optimizer
-
-## 1. About the optimizer
-`optimizer` is a graph-fusion tool bundled with IxRT that fuses ops in an ONNX graph into the corresponding IxRT plugins.
-
-## 2. Features
-| Feature | Description |
-| -------------- | ---- |
-| Multiple batch sizes | Run inference tests with different batch sizes |
-| Dynamic graphs | Fuse both dynamic and static graphs |
-| Supported models | Verified so far on videobert, roberta, deberta, swinL, roformer, albert, and other models |
-
-## 3. Command-line arguments
-| Argument | Description |
-| -------------- | ---- |
-| `--onnx` | Required. Path of the ONNX model to optimize |
-| `--num_heads` | Optional. Number of attention heads in the model's Attention modules |
-| `--hidden_size` | Optional. Hidden size of the model |
-| `--input_shapes` | Optional. Static input shapes of the model, e.g. --input_shapes "input_name1:3x224x224, input_name2:3x224x224" |
-| `--dump_onnx` | Optional. Dump intermediate ONNX graphs during fusion |
-| `--model_type` | Optional. Type of model to fuse; defaults to "bert", choices are ["bert", "swint", "roformer"] |
-| `--log_level` | Optional. Log level shown by IxRT at runtime: debug, info, or error; defaults to info |
-
-
-## 4. Usage examples
-
-### 4.1 Example 1: fuse albert|videobert|roberta|deberta
-```bash
-cd oss/tools/optimizer
-python3 optimizer.py --onnx ${MODEL_PATH}
-```
-
-### 4.2 Example 2: fuse swinL
-```bash
-cd oss/tools/optimizer
-python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint
-```
-
-### 4.3 Example 3: fuse roformer
-```bash
-cd oss/tools/optimizer
-python3 optimizer.py --onnx ${MODEL_PATH} --model_type roformer
-```
-
-### 4.4 Accuracy verification
-
-See the accuracy comparison tool section in [Advanced Topics](5_advanced_topics.md) for detailed usage and how it works.
-
-You can also use the [C++ API introduction](3_cpp_api.md) or the [Python API introduction](4_python_api.md).
-
-See oss/samples for concrete usage.
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
deleted file mode 100644
index 0f301e3a58e14713c7ebb26342a6fb39ecdca80e..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-import argparse
-import logging
-import time
-from typing import Dict, Optional
-
-import onnx
-from onnx import ModelProto, helper, load_model
-from onnx_model_bert import BertOnnxModel
-from onnx_model_roformer import RoformerOnnxModel
-from onnx_model_conformer import conformerOnnxModel
-from onnx_model_t5 import T5OnnxModel
-from onnx_model_yolo import YoloOnnxModel
-from onnx_model_PVT import PVTOnnxModel
-from onnx_model_cosyvoice import cosyvoiceOnnxModel
-
-
-from onnxsim import simplify
-from passes.fusion_options import FusionOptions
-from passes.symbolic_shape_infer import SymbolicShapeInference
-
-logger = logging.getLogger(__name__)
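-# Each entry maps a model_type to a tuple of (optimizer class, transformer class, expected
-# ONNX producer name, extra field). optimize_by_fusion below only uses the first three and
-# ignores the last element.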
-MODEL_TYPES = {
- "bert": (BertOnnxModel, None, "pytorch", 1),
- "swint": (BertOnnxModel, None, "pytorch", 1),
- "roformer": (RoformerOnnxModel, None, "tf2onnx", 1),
- "gpt2": (BertOnnxModel, None, "pytorch", 1),
- "t5": (T5OnnxModel, None, "tf2onnx", 1),
- "yolo": (YoloOnnxModel, None, "pytorch", 1),
- "vit": (BertOnnxModel, None, "pytorch", 1),
- "conformer": (conformerOnnxModel, None, "pytorch", 1),
- "PVT": (PVTOnnxModel, None, "pytorch", 1),
- "omdet": (BertOnnxModel, None, "pytorch", 1),
- "cosyvoice": (cosyvoiceOnnxModel, None, "pytorch", 1)
-
-}
-
-
-def optimize_by_fusion(
- model: ModelProto,
- model_type: str = "bert",
- num_heads: int = 0,
- hidden_size: int = 0,
- optimization_options: Optional[FusionOptions] = None,
-):
- """Optimize Model by graph fusion logic.
-
- Note that ONNXRuntime graph optimizations (like constant folding) will not be applied. So it is better to enable
- constant folding when exporting the ONNX model, or to run optimize_by_onnxruntime on the model first, as optimize_model does.
-
- For BERT models, num_heads and hidden_size are optional. For other model types, you need to specify these parameters.
-
- Args:
- model (ModelProto): model object
- model_type (str, optional): model type - one of the keys of MODEL_TYPES, e.g. bert, swint, roformer, t5 or yolo. Defaults to 'bert'.
- num_heads (int, optional): number of attention heads. Defaults to 0.
- 0 lets the parameter be detected from the graph automatically (for model_type "bert" only).
- hidden_size (int, optional): hidden size. Defaults to 0.
- 0 lets the parameter be detected from the graph automatically (for model_type "bert" only).
- optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions. Defaults to None.
-
- Returns:
- object of an optimizer class.
- """
- if model_type != "bert" and (num_heads == 0 or hidden_size == 0):
- logger.warning(
- "Please specify parameters of num_heads and hidden_size when model_type is not 'bert'"
- )
-
- (optimizer_class, transformer_class, producer, _) = MODEL_TYPES[model_type]
-
- if model.producer_name and producer != model.producer_name:
- logger.warning(
- f'Model producer not matched: Expected "{producer}", Got "{model.producer_name}".'
- "Please specify correct --model_type parameter."
- )
-
- if optimization_options is None:
- optimization_options = FusionOptions(model_type)
-
- optimizer = optimizer_class(model, num_heads, hidden_size)
-
- optimizer.optimize(optimization_options)
-
- optimizer.topological_sort()
-
- return optimizer, transformer_class
-
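-# Minimal usage sketch (illustrative model path and parameters):
-#   model = onnx.load("albert-torch-fp32-sim.onnx")
-#   opt_model, _ = optimize_by_fusion(model, model_type="bert", num_heads=12, hidden_size=768)
-#   opt_model.save_model_to_file("albert-torch-fp32-sim_end.onnx")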
-
-def optimize_to_ixrt(args):
- onnx_name = args.onnx[:-5]
- model = onnx.load(args.onnx)
- if not args.not_sim:
- logger.info("simplify..")
- simplified_model, check = simplify(model)
- logger.info("simplify model end...")
- if args.dump_onnx:
- onnx.save(simplified_model, onnx_name + "_sim.onnx")
-
- # transfer to static shape and optimize it
- static_sim_model = simplified_model
- if args.input_shapes:
- for input_tensor in simplified_model.graph.input:
- if input_tensor.name in args.input_shapes.keys():
- new_shape = args.input_shapes[input_tensor.name]
- dim_list = []
- for dim in new_shape:
- if isinstance(dim, int):
- dim_proto = onnx.TensorShapeProto.Dimension()
- dim_proto.dim_value = dim
- dim_list.append(dim_proto)
- elif isinstance(dim, str):
- dim_proto = onnx.TensorShapeProto.Dimension()
- dim_proto.dim_param = dim
- dim_list.append(dim_proto)
-
- del input_tensor.type.tensor_type.shape.dim[:]
- input_tensor.type.tensor_type.shape.dim.extend(dim_list)
-
- try:
- auto_merge = False
- if args.model_type in ["roformer"]:
- auto_merge = True
- static_model = SymbolicShapeInference.infer_shapes(
- simplified_model, 2**31 - 1, auto_merge, False, 3
- )
- static_sim_model, check = simplify(static_model)
- if args.dump_onnx:
- onnx.save(static_sim_model, onnx_name + "_sim_static_sim.onnx")
- except Exception as e:
- # Fall back to the simplified (dynamic-shape) model if symbolic shape inference fails.
- logger.warning(f"Symbolic shape inference failed, using the simplified model instead: {e}")
- static_model = static_sim_model = simplified_model
-
- if args.dump_onnx:
- onnx.save(static_model, onnx_name + "_sim_static.onnx")
- if args.not_sim:
- static_sim_model = model
-
- logger.info("start fusion..")
- opt_model, _ = optimize_by_fusion(
- static_sim_model, args.model_type, args.num_heads, args.hidden_size
- )
- opt_model.save_model_to_file(onnx_name + "_end.onnx")
- logger.info("done..")
-
-
-def parse_params(params_str):
- params = {}
- for item in params_str.replace(" ", "").split(","):
- key, value = item.split(":")
- params[key] = [int(x) if x.isdigit() else x for x in value.split("x")]
- return params
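-
-# Illustrative example (input names are hypothetical): parse_params("input_ids:8x128, pixel_values.1:8x3x384x384")
-# returns {"input_ids": [8, 128], "pixel_values.1": [8, 3, 384, 384]}; non-numeric tokens
-# such as "batch" are kept as strings and later become symbolic dim_param entries.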
-
-
-def args_parser():
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "--onnx", type=str, default=None, required=False, help="ONNX model file path"
- )
- parser.add_argument(
- "--num_heads",
- type=int,
- default=0,
- help="Used in model optimization. The num of the head used in the network",
- )
- parser.add_argument(
- "--hidden_size",
- type=int,
- default=0,
- help="Used in model optimization. The hidden_size used in the network",
- )
- parser.add_argument(
- "--input_shapes",
- type=parse_params,
- help='Static input_shapes to the inference, format is --input_shapes "input_name1:3x224x224, input_name2:3x224x224"',
- )
- parser.add_argument(
- "--dump_onnx",
- action="store_true",
- help="Whether to dump onnx",
- )
- parser.add_argument(
- "--model_type",
- type=str,
- default="bert",
- choices=["bert", "swint", "roformer", "t5", "yolo", "gpt2", "vit", "conformer","PVT","omdet","cosyvoice"],
- help="Which kind of model to optimize",
- )
- parser.add_argument(
- "--log_level",
- type=str,
- default="info",
- choices=["debug", "info", "error"],
- help="Which kind of model to optimize",
- )
-
- parser.add_argument(
- "--not_sim",
- action="store_true",
- default=False,
- help="simplify model or not",
- )
- return parser.parse_args()
-
-
-if __name__ == "__main__":
- args = args_parser()
- if args.log_level == "info":
- logging.basicConfig(level=logging.INFO)
- elif args.log_level == "debug":
- logging.basicConfig(level=logging.DEBUG)
- else:
- logging.basicConfig(level=logging.ERROR)
- optimize_to_ixrt(args)
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py
deleted file mode 100644
index de522e5b082b122a28b0a0423a40909598aa82d5..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py
deleted file mode 100644
index 96da8751b0200bb8610e3dd5070f26ebc51e97ac..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py
+++ /dev/null
@@ -1,477 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
-# Modifications: keep_io_types can be list of names; convert initializers if needed to preserve precision; add force_fp16_initializers option.
-
-import itertools
-import logging
-from typing import Dict, List
-
-import numpy as np
-import onnx
-from onnx import helper, numpy_helper
-from onnx import onnx_pb as onnx_proto
-from packaging import version
-
-logger = logging.getLogger(__name__)
-
-
-def _npfloat16_to_int(np_list):
- """
- Convert numpy float16 to python int.
-
- :param np_list: numpy float16 list
- :return int_list: python int list
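-
- Example: _npfloat16_to_int([np.float16(1.0)]) returns [15360], i.e. 0x3C00,
- the IEEE 754 half-precision bit pattern of 1.0.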
- """
- return [int(bin(_.view("H"))[2:].zfill(16), 2) for _ in np_list]
-
-
-def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65504.0):
- """
- Convert float32 numpy array to float16 without changing sign or finiteness.
- Positive values less than min_positive_val are mapped to min_positive_val.
- Positive finite values greater than max_finite_val are mapped to max_finite_val.
- Similar for negative values. NaN, 0, inf, and -inf are unchanged.
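-
- For example, convert_np_to_float16(np.array([1e-9, 1e9, 0.0])) returns approximately
- [5.96e-08, 65504.0, 0.0]: tiny positives saturate up to min_positive_val and large
- finite values saturate down to max_finite_val.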
- """
-
- def between(a, b, c):
- return np.logical_and(a < b, b < c)
-
- np_array = np.where(
- between(0, np_array, min_positive_val), min_positive_val, np_array
- )
- np_array = np.where(
- between(-min_positive_val, np_array, 0), -min_positive_val, np_array
- )
- np_array = np.where(
- between(max_finite_val, np_array, float("inf")), max_finite_val, np_array
- )
- np_array = np.where(
- between(float("-inf"), np_array, -max_finite_val), -max_finite_val, np_array
- )
- return np.float16(np_array)
-
-
-def convert_tensor_float_to_float16(
- tensor, min_positive_val=5.96e-08, max_finite_val=65504.0
-):
- """Convert tensor float to float16.
-
- Args:
- tensor (TensorProto): the tensor to convert.
- min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08.
- max_finite_val (float, optional): maximal finite value. Defaults to 65504.0.
-
- Raises:
- ValueError: input type is not TensorProto.
-
- Returns:
- TensorProto: the converted tensor.
- """
-
- if not isinstance(tensor, onnx_proto.TensorProto):
- raise ValueError(
- "Expected input type is an ONNX TensorProto but got %s" % type(tensor)
- )
-
- if tensor.data_type == onnx_proto.TensorProto.FLOAT:
- tensor.data_type = onnx_proto.TensorProto.FLOAT16
- # convert float_data (float type) to float16 and write to int32_data
- if tensor.float_data:
- float16_data = convert_np_to_float16(
- np.array(tensor.float_data), min_positive_val, max_finite_val
- )
- int_list = _npfloat16_to_int(float16_data)
- tensor.int32_data[:] = int_list
- tensor.float_data[:] = []
- # convert raw_data (bytes type)
- if tensor.raw_data:
- # convert n.raw_data to float
- float32_list = np.frombuffer(tensor.raw_data, dtype="float32")
- # convert float to float16
- float16_list = convert_np_to_float16(
- float32_list, min_positive_val, max_finite_val
- )
- # convert float16 to bytes and write back to raw_data
- tensor.raw_data = float16_list.tobytes()
- return tensor
-
-
-def make_value_info_from_tensor(tensor):
- shape = numpy_helper.to_array(tensor).shape
- return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape)
-
-
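-# Op types kept in float32 by default: mostly ai.onnx.ml operators plus shape/index-sensitive
-# ops (e.g. TopK, NonMaxSuppression, Resize) that are typically left in full precision.
-# Callers can override this via the op_block_list argument of convert_float_to_float16.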
-DEFAULT_OP_BLOCK_LIST = [
- "ArrayFeatureExtractor",
- "Binarizer",
- "CastMap",
- "CategoryMapper",
- "DictVectorizer",
- "FeatureVectorizer",
- "Imputer",
- "LabelEncoder",
- "LinearClassifier",
- "LinearRegressor",
- "Normalizer",
- "OneHotEncoder",
- "SVMClassifier",
- "SVMRegressor",
- "Scaler",
- "TreeEnsembleClassifier",
- "TreeEnsembleRegressor",
- "ZipMap",
- "NonMaxSuppression",
- "TopK",
- "RoiAlign",
- "Resize",
- "Range",
- "CumSum",
- "Min",
- "Max",
- "Upsample",
-]
-
-
-class InitializerTracker:
- """Class for keeping track of initializer."""
-
- def __init__(self, initializer: onnx_proto.TensorProto):
- self.initializer = initializer
- self.fp32_nodes = []
- self.fp16_nodes = []
-
- def add_node(self, node: onnx_proto.NodeProto, is_node_blocked):
- if is_node_blocked:
- self.fp32_nodes.append(node)
- else:
- self.fp16_nodes.append(node)
-
-
-def convert_float_to_float16(
- model,
- min_positive_val=5.96e-08,
- max_finite_val=65504.0,
- keep_io_types=False,
- disable_shape_infer=False,
- op_block_list=None,
- node_block_list=None,
- force_fp16_initializers=False,
-):
- """Convert model tensor float type in the ONNX ModelProto input to tensor float16.
-
- Args:
- model (ModelProto): The ONNX model to convert.
- min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08.
- max_finite_val (float, optional): maximal finite value of float16. Defaults to 65504.
- keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names.
- If True, model inputs/outputs should be left as float32. Defaults to False.
- disable_shape_infer (bool, optional): Skips running onnx shape/type inference. Useful if shape inference has been done. Defaults to False.
- op_block_list (List[str], optional): List of op types to leave as float32.
- Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST` as default.
- node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None.
- force_fp16_initializers(bool): force converting all float initializers to float16.
- Default to false, which will convert only the one needed to avoid precision loss.
- Raises:
- ValueError: input type is not ModelProto.
-
- Returns:
- ModelProto: converted model.
- """
- assert (
- min_positive_val >= 5.96e-08
- ), "invalid min_positive_val. smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05"
- assert max_finite_val <= float(
- np.finfo(np.float16).max
- ), "invalid max_finite_val. largest float16 value: 65504"
-
- func_infer_shape = None
- if not disable_shape_infer and version.parse(onnx.__version__) >= version.parse(
- "1.2.0"
- ):
- try:
- from onnx.shape_inference import infer_shapes
-
- func_infer_shape = infer_shapes
- finally:
- pass
-
- if not isinstance(model, onnx_proto.ModelProto):
- raise ValueError(
- "Expected model type is an ONNX ModelProto but got %s" % type(model)
- )
-
- # create blocklists
- if op_block_list is None:
- op_block_list = DEFAULT_OP_BLOCK_LIST
- if node_block_list is None:
- node_block_list = []
- op_block_list = set(op_block_list)
- node_block_list = set(node_block_list)
-
- logger.debug(
- f"fp16 parameters: min_positive_val={min_positive_val} max_finite_val={max_finite_val} keep_io_types={keep_io_types} disable_shape_infer={disable_shape_infer} op_block_list={op_block_list} node_block_list={node_block_list} force_fp16_initializers={force_fp16_initializers}"
- )
-
- # create a queue for BFS
- queue = []
- value_info_list = []
- node_list = []
- # type inference on input model
- if func_infer_shape is not None:
- model = func_infer_shape(model)
- queue.append(model)
- name_mapping = {}
- graph_io_to_skip = set()
- io_casts = set()
-
- fp32_inputs = [
- n.name
- for n in model.graph.input
- if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT
- ]
- fp32_outputs = [
- n.name
- for n in model.graph.output
- if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT
- ]
- if isinstance(keep_io_types, list):
- fp32_inputs = [n for n in fp32_inputs if n in keep_io_types]
- fp32_outputs = [n for n in fp32_outputs if n in keep_io_types]
- elif not keep_io_types:
- fp32_inputs = []
- fp32_outputs = []
-
- for i, n in enumerate(model.graph.input):
- if n.name in fp32_inputs:
- output_name = "graph_input_cast_" + str(i)
- name_mapping[n.name] = output_name
- graph_io_to_skip.add(n.name)
-
- node_name = "graph_input_cast" + str(i)
- new_value_info = model.graph.value_info.add()
- new_value_info.CopyFrom(n)
- new_value_info.name = output_name
- new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
- # add Cast node (from tensor(float) to tensor(float16) after graph input
- new_node = [
- helper.make_node("Cast", [n.name], [output_name], to=10, name=node_name)
- ]
- model.graph.node.extend(new_node)
- value_info_list.append(new_value_info)
- io_casts.add(node_name)
-
- for i, n in enumerate(model.graph.output):
- if n.name in fp32_outputs:
- input_name = "graph_output_cast_" + str(i)
- name_mapping[n.name] = input_name
- graph_io_to_skip.add(n.name)
-
- node_name = "graph_output_cast" + str(i)
- # add Cast node (from tensor(float16) to tensor(float) before graph output
- new_value_info = model.graph.value_info.add()
- new_value_info.CopyFrom(n)
- new_value_info.name = input_name
- new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
- new_node = [
- helper.make_node("Cast", [input_name], [n.name], to=1, name=node_name)
- ]
- model.graph.node.extend(new_node)
- value_info_list.append(new_value_info)
- io_casts.add(node_name)
-
- fp32_initializers: Dict[str, InitializerTracker] = {}
- while queue:
- next_level = []
- for q in queue:
- # if q is model, push q.graph (GraphProto)
- if isinstance(q, onnx_proto.ModelProto):
- next_level.append(q.graph)
- # if q is model.graph, push q.node.attribute (AttributeProto)
- if isinstance(q, onnx_proto.GraphProto):
- for n in q.initializer: # TensorProto type
- if n.data_type == onnx_proto.TensorProto.FLOAT:
- assert n.name not in fp32_initializers
- fp32_initializers[n.name] = InitializerTracker(n)
-
- for n in q.node:
- # if n is in the block list (doesn't support float16), no conversion for the node,
- # and save the node for further processing
- if n.name in io_casts:
- continue
- for i in range(len(n.input)):
- if n.input[i] in name_mapping:
- n.input[i] = name_mapping[n.input[i]]
- for i in range(len(n.output)):
- if n.output[i] in name_mapping:
- n.output[i] = name_mapping[n.output[i]]
-
- is_node_blocked = (
- n.op_type in op_block_list or n.name in node_block_list
- )
- for input in n.input:
- if input in fp32_initializers:
- fp32_initializers[input].add_node(n, is_node_blocked)
-
- if is_node_blocked:
- node_list.append(n)
- else:
- if n.op_type == "Cast":
- for attr in n.attribute:
- if attr.name == "to" and attr.i == 1:
- attr.i = 10
- break
- for attr in n.attribute:
- next_level.append(attr)
- # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto)
- # and process node.attribute.t and node.attribute.tensors (TensorProto)
- if isinstance(q, onnx_proto.AttributeProto):
- next_level.append(q.g)
- for n in q.graphs:
- next_level.append(n)
- q.t.CopyFrom(
- convert_tensor_float_to_float16(
- q.t, min_positive_val, max_finite_val
- )
- )
- for n in q.tensors:
- n = convert_tensor_float_to_float16(
- n, min_positive_val, max_finite_val
- )
- # if q is graph, process input, output and value_info (ValueInfoProto)
- if isinstance(q, onnx_proto.GraphProto):
- # Note that float initializers tracked by fp32_initializers will be processed later.
- # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to
- # tensor(float16) except map and seq(map). And save them in value_info_list for further processing
- for n in itertools.chain(q.input, q.output, q.value_info):
- if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
- if n.name not in graph_io_to_skip:
- n.type.tensor_type.elem_type = (
- onnx_proto.TensorProto.FLOAT16
- )
- value_info_list.append(n)
- if n.type.HasField("sequence_type"):
- if (
- n.type.sequence_type.elem_type.tensor_type.elem_type
- == onnx_proto.TensorProto.FLOAT
- ):
- if n.name not in graph_io_to_skip:
- n.type.sequence_type.elem_type.tensor_type.elem_type = (
- onnx_proto.TensorProto.FLOAT16
- )
- value_info_list.append(n)
-
- queue = next_level
-
- for key, value in fp32_initializers.items():
- # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes.
- if force_fp16_initializers or value.fp16_nodes:
- value.initializer = convert_tensor_float_to_float16(
- value.initializer, min_positive_val, max_finite_val
- )
- value_info_list.append(make_value_info_from_tensor(value.initializer))
- if value.fp32_nodes and not force_fp16_initializers:
- logger.info(
- "initializer is used by both fp32 and fp16 nodes. Consider add these nodes to block list:{}".format(
- value.fp16_nodes
- )
- )
-
- # process the nodes in block list that doesn't support tensor(float16)
- for node in node_list:
- # if input's name is in the value_info_list meaning input is tensor(float16) type,
- # insert a float16 to float Cast node before the node,
- # change current node's input name and create new value_info for the new name
- for i in range(len(node.input)):
- input = node.input[i]
- for value_info in value_info_list:
- if input == value_info.name:
- # create new value_info for current node's new input name
- new_value_info = model.graph.value_info.add()
- new_value_info.CopyFrom(value_info)
- output_name = node.name + "_input_cast_" + str(i)
- new_value_info.name = output_name
- new_value_info.type.tensor_type.elem_type = (
- onnx_proto.TensorProto.FLOAT
- )
-                    # add Cast node (from tensor(float16) to tensor(float)) before current node
- node_name = node.name + "_input_cast" + str(i)
- new_node = [
- helper.make_node(
- "Cast", [input], [output_name], to=1, name=node_name
- )
- ]
- model.graph.node.extend(new_node)
- # change current node's input name
- node.input[i] = output_name
- break
- # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a float to
- # float16 Cast node after the node, change current node's output name and create new value_info for the new name
- for i in range(len(node.output)):
- output = node.output[i]
- for value_info in value_info_list:
- if output == value_info.name:
- # create new value_info for current node's new output
- new_value_info = model.graph.value_info.add()
- new_value_info.CopyFrom(value_info)
- input_name = node.name + "_output_cast_" + str(i)
- new_value_info.name = input_name
- new_value_info.type.tensor_type.elem_type = (
- onnx_proto.TensorProto.FLOAT
- )
-                    # add Cast node (from tensor(float) to tensor(float16)) after current node
- node_name = node.name + "_output_cast" + str(i)
- new_node = [
- helper.make_node(
- "Cast", [input_name], [output], to=10, name=node_name
- )
- ]
- model.graph.node.extend(new_node)
-                    # change current node's output name
- node.output[i] = input_name
- break
- return model
-
-
-def float_to_float16_max_diff(
- tensor, min_positive_val=5.96e-08, max_finite_val=65504.0
-):
- """Measure the maximum absolute difference after converting a float tensor to float16."""
- if not isinstance(tensor, onnx_proto.TensorProto):
- raise ValueError(
- "Expected input type is an ONNX TensorProto but got %s" % type(tensor)
- )
- if tensor.data_type != onnx_proto.TensorProto.FLOAT:
- raise ValueError("Expected tensor data type is float.")
-
- float32_data = None
- if tensor.float_data:
- float32_data = np.array(tensor.float_data)
-
- if tensor.raw_data:
- float32_data = np.frombuffer(tensor.raw_data, dtype="float32")
-
- if float32_data is None:
- raise RuntimeError("external data not loaded!")
-
- float16_data = convert_np_to_float16(float32_data, min_positive_val, max_finite_val)
- return np.amax(np.abs(float32_data - np.float32(float16_data)))
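
For reference, the saturation that the conversion above applies before casting to float16 can be reproduced with plain NumPy. This is a standalone sketch using the same default bounds (5.96e-08 and 65504.0); it is not a drop-in replacement for `convert_np_to_float16`.

```python
import numpy as np

def saturate_to_float16(x, min_positive_val=5.96e-08, max_finite_val=65504.0):
    """Clamp float32 magnitudes into the float16 range before casting, so the
    cast neither overflows to inf nor flushes small non-zero values to zero."""
    x = np.asarray(x, dtype=np.float32)
    sign = np.sign(x)
    mag = np.abs(x)
    mag = np.where((mag > 0) & (mag < min_positive_val), min_positive_val, mag)
    mag = np.where(mag > max_finite_val, max_finite_val, mag)
    return (sign * mag).astype(np.float16)

# Analogue of float_to_float16_max_diff for a raw array:
data = np.array([1e-9, 0.5, 70000.0, -3.14], dtype=np.float32)
print(np.amax(np.abs(data - np.float32(saturate_to_float16(data)))))
```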
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_inverse_sigmoid.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_inverse_sigmoid.py
deleted file mode 100644
index 9862d9ee4bee8da619750b2544ddc48d35be0fa9..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_inverse_sigmoid.py
+++ /dev/null
@@ -1,85 +0,0 @@
-
-from logging import getLogger
-from typing import Dict
-
-import numpy as np
-from onnx import TensorProto, helper
-
-from .fusion_base import Fusion
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-class FusionLayerInverseSigmoid(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(
- model, "InverseSigmoid", "Clip"
- )
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- +------------Clip-----------+
- | |
- | v
- [Root] --> Clip--> Sub --> Clip --> Div --> Log
- """
- children = self.model.get_children(node, input_name_to_nodes)
- if len(children) != 2:
- return
-
- root_input = node.input[0]
-
- if not ((children[0].op_type == "Sub" and children[1].op_type == "Clip") or (children[0].op_type == "Clip" and children[1].op_type == "Sub")):
- return
-
- log_node = None
- for child in children:
- log_node = self.model.find_first_child_by_type(
- child, "Log", input_name_to_nodes, recursive=True
- )
- if log_node is not None:
- break
- if log_node is None:
- return
- parent_nodes = self.model.match_parent_path(
- log_node,
- ["Div", "Clip", "Sub", "Clip"],
- [0, 1, 0, 1],
- output_name_to_node,
- )
- if parent_nodes is None:
- return
-
- sub_node = parent_nodes[2]
- if sub_node not in children:
- return
-
- div_node = parent_nodes[0]
- div_parents_nodes = self.model.get_parents(div_node)
- if len(div_parents_nodes) != 2:
- return
- if div_parents_nodes[0].op_type != "Clip":
- return
- if div_parents_nodes[0] not in children:
- return
-
- subgraph_nodes = [node]
- subgraph_nodes.extend([log_node])
- subgraph_nodes.extend(parent_nodes)
- subgraph_nodes.extend([div_parents_nodes[0]])
- _, eps_val = self.model.get_constant_input(div_parents_nodes[0])
-
- self.nodes_to_remove.extend(subgraph_nodes)
- inverse_sigmoid_node = helper.make_node(
- "InverseSigmoid",
- inputs=[node.input[0]],
- outputs=[log_node.output[0]],
- name=self.model.create_node_name(
- "InverseSigmoid", name_prefix="InverseSigmoid"
- ),
- )
- inverse_sigmoid_node.attribute.extend(
- [helper.make_attribute("epsilon", float(eps_val))]
- )
- self.nodes_to_add.append(inverse_sigmoid_node)
- self.node_name_to_graph_name[inverse_sigmoid_node.name] = self.this_graph_name
\ No newline at end of file
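
The subgraph removed above is the usual inverse-sigmoid (logit) computation. A NumPy reference of the same math, with a hypothetical `eps` standing in for the Clip bounds that the pass reads via `get_constant_input`:

```python
import numpy as np

def inverse_sigmoid_reference(x, eps=1e-5):
    """Reference for the Clip -> Sub -> Clip -> Div -> Log chain:
    logit(x) = log(x / (1 - x)), with numerator and denominator clipped
    away from zero by eps for numerical stability."""
    x = np.clip(x, 0.0, 1.0)
    num = np.clip(x, eps, None)
    den = np.clip(1.0 - x, eps, None)
    return np.log(num / den)

print(inverse_sigmoid_reference(np.array([0.0, 0.25, 0.5, 0.9, 1.0])))
```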
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_l2_normalization.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_l2_normalization.py
deleted file mode 100644
index bfd1ed28eb8b0f3d7c65b1e31da8c1dc45415ce7..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_l2_normalization.py
+++ /dev/null
@@ -1,69 +0,0 @@
-from logging import getLogger
-from typing import Dict
-
-import numpy as np
-from onnx import TensorProto, helper
-
-from .fusion_base import Fusion
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-class FusionLayerL2Normalization(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(
- model, "L2Normalization", "Abs"
- )
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- +-------------------------------------------------------+
- | |
- | v
- [Root] --> Abs--> Pow --> ReduceSum --> Pow --> Clip --> Div
- """
- pow1_nodes = self.model.get_children(node, input_name_to_nodes)
- if len(pow1_nodes) != 1 or pow1_nodes[0].op_type != "Pow":
- return
-
- reduce_nodes = self.model.get_children(pow1_nodes[0], input_name_to_nodes)
- if len(reduce_nodes) != 1 or reduce_nodes[0].op_type != "ReduceSum":
- return
-
- pow2_nodes = self.model.get_children(reduce_nodes[0], input_name_to_nodes)
- if len(pow2_nodes) != 1 or pow2_nodes[0].op_type != "Pow":
- return
-
- clip_nodes = self.model.get_children(pow2_nodes[0], input_name_to_nodes)
- if len(clip_nodes) != 1 or clip_nodes[0].op_type != "Clip":
- return
-
- div_nodes = self.model.get_children(clip_nodes[0], input_name_to_nodes)
- if len(div_nodes) != 1 or div_nodes[0].op_type != "Div":
- return
-
- root_input = node.input[0]
- if div_nodes[0].input[0] != root_input:
- return
-
- subgraph_nodes = [node, pow1_nodes[0], reduce_nodes[0], pow2_nodes[0], clip_nodes[0], div_nodes[0]]
- _, eps_val = self.model.get_constant_input(clip_nodes[0])
- _, norm_axes = self.model.get_constant_input(reduce_nodes[0])
- norm_axes = norm_axes.astype(np.int32)
-
- self.nodes_to_remove.extend(subgraph_nodes)
- l2_normalization_node = helper.make_node(
- "L2Normalization",
- inputs=[node.input[0]],
- outputs=[div_nodes[0].output[0]],
- name=self.model.create_node_name(
- "L2Normalization", name_prefix="L2Normalization"
- ),
- )
- l2_normalization_node.attribute.extend(
- [helper.make_attribute("epsilon", float(eps_val)),
- helper.make_attribute("axes", norm_axes),
- helper.make_attribute("axes_length", int(norm_axes.size))]
- )
- self.nodes_to_add.append(l2_normalization_node)
- self.node_name_to_graph_name[l2_normalization_node.name] = self.this_graph_name
\ No newline at end of file
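
The Abs -> Pow -> ReduceSum -> Pow -> Clip -> Div chain matched above is a plain L2 normalization. A NumPy reference of what the fused `L2Normalization` node computes, with illustrative `axes` and `eps` values:

```python
import numpy as np

def l2_normalization_reference(x, axes=(-1,), eps=1e-12):
    """Divide x by its L2 norm over `axes`, clipping the norm from below by
    eps (the Clip constant the pass stores as the 'epsilon' attribute)."""
    norm = np.sqrt(np.sum(np.abs(x) ** 2, axis=tuple(axes), keepdims=True))
    return x / np.clip(norm, eps, None)

x = np.random.randn(2, 4).astype(np.float32)
print(np.linalg.norm(l2_normalization_reference(x), axis=-1))  # ~1.0 per row
```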
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_omdet_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_omdet_attention.py
deleted file mode 100644
index 3451731f835ef05d8e61e0b5da2ef724be808f17..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_omdet_attention.py
+++ /dev/null
@@ -1,149 +0,0 @@
-
-from logging import getLogger
-from typing import Dict
-
-import math
-import numpy as np
-from onnx import TensorProto, helper
-
-from .fusion_base import Fusion
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-class FusionLayerOmdetAttention(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(
- model, "CustomQKVToContextPluginDynamic_IxRT", "CustomFCPluginDynamic_IxRT"
- )
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- [Root] --> CustomFCPluginDynamic_IxRT--> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT
- """
- children = self.model.get_children(node, input_name_to_nodes)
- parent = self.model.get_parents(node, output_name_to_node)
-
- if len(children) != 1:
- return
- if len(parent) != 1:
- return
-
- fc_first_node = None
- for par in parent:
- fc_first_node = self.model.find_first_parent_by_type(
- par, "CustomFCPluginDynamic_IxRT", output_name_to_node, recursive=True
- )
- if fc_first_node is not None:
- break
- if fc_first_node is None:
- return
-
- start_node = node
-
- # v path
- v_nodes = self.model.match_parent_path(
- start_node,
- ["Reshape", "Transpose", "MatMul", "Gather", "Transpose", "Reshape"],
- [0, 0, 0, 1, 0, 0],
- output_name_to_node,
- )
-
- # path1, q and k path
- q_nodes = self.model.match_parent_path(
- start_node,
- ["Reshape", "Transpose", "MatMul", "Softmax", "Add", "MatMul", "Transpose", "Gather", "Transpose", "Reshape"],
- [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
- output_name_to_node,
- )
-
- k_nodes = self.model.match_parent_path(
- start_node,
- ["Reshape", "Transpose", "MatMul", "Softmax", "Add", "MatMul", "Mul", "Gather", "Transpose", "Reshape"],
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
-
- # path2, q and k path
- q_nodes_1 = self.model.match_parent_path(
- start_node,
- ["Reshape", "Transpose", "MatMul", "Softmax", "Reshape", "Add", "Reshape", "Add", "MatMul", "Transpose", "Gather", "Transpose", "Reshape"],
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
- output_name_to_node,
- )
-
- k_nodes_1 = self.model.match_parent_path(
- start_node,
- ["Reshape", "Transpose", "MatMul", "Softmax", "Reshape", "Add", "Reshape", "Add", "MatMul", "Mul", "Gather", "Transpose", "Reshape"],
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
-
- if v_nodes is None:
- return
-
- if v_nodes and q_nodes and k_nodes:
- subgraph_nodes = []
- subgraph_nodes.extend(q_nodes)
- subgraph_nodes.extend(k_nodes)
- subgraph_nodes.extend(v_nodes)
-
- subgraph_nodes_unique = []
- for item in subgraph_nodes:
- if item not in subgraph_nodes_unique:
- subgraph_nodes_unique.append(item)
-
- add_node = q_nodes[4]
- hidden_size = start_node.attribute[0].i
- _, mul_val = self.model.get_constant_input(k_nodes[6])
-            num_heads = hidden_size // math.floor((1 / mul_val) * (1 / mul_val))
- attention_input_1_name = add_node.input[1]
-
- if v_nodes and q_nodes_1 and k_nodes_1:
- subgraph_nodes = []
- subgraph_nodes.extend(q_nodes_1)
- subgraph_nodes.extend(k_nodes_1)
- subgraph_nodes.extend(v_nodes)
-
- subgraph_nodes_unique = []
- for item in subgraph_nodes:
- if item not in subgraph_nodes_unique:
- subgraph_nodes_unique.append(item)
-
- hidden_size = start_node.attribute[0].i
- _, mul_val = self.model.get_constant_input(k_nodes_1[9])
-            num_heads = hidden_size // math.floor((1 / mul_val) * (1 / mul_val))
-
- add_1 = self.model.get_initializer(q_nodes_1[5].input[1], True)
- add_2 = self.model.get_initializer(q_nodes_1[7].input[1], True)
- add_all = np.squeeze(add_1 + add_2)
-
- attention_input_1_name = "attention_" + q_nodes_1[5].input[1]
- attention_input_1 = helper.make_tensor(
- attention_input_1_name, TensorProto.FLOAT, add_all.shape, add_all.flatten().tolist())
-
- self.model.add_initializer(attention_input_1, self.this_graph_name)
-
- attention_node = helper.make_node(
- "CustomQKVToContextPluginDynamic_IxRT",
- inputs=[fc_first_node.output[0], attention_input_1_name],
- outputs=[start_node.input[0]],
- name=self.model.create_node_name(
- "OmdetAttention", name_prefix="OmdetAttention"
- ),
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
- attention_node.attribute.extend([helper.make_attribute("hidden_size", hidden_size)])
- attention_node.attribute.extend([helper.make_attribute("has_mask", 1)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)])
-
- self.nodes_to_remove.extend(subgraph_nodes_unique)
-
- self.nodes_to_add.append(attention_node)
- self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
-
-
\ No newline at end of file
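
The head-count recovery above relies on the Mul before the QK MatMul carrying the attention scale 1/sqrt(head_size). A small sketch with made-up values (the real ones come from the matched graph attributes):

```python
import math

hidden_size = 768        # hypothetical value of start_node.attribute[0].i
mul_val = 1.0 / 8.0      # hypothetical per-head scale, i.e. head_size == 64
head_size = math.floor((1.0 / mul_val) * (1.0 / mul_val))
num_heads = hidden_size // head_size
print(head_size, num_heads)  # 64 12
```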
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py
deleted file mode 100644
index bb9a1cab034aaf714b416ea971ac9e6d69884894..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-from logging import getLogger
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionSerialBiasAdd(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "Add", "Softmax")
-
- def match_parent_path_from_dict(self, start_node, path_dict):
- res_path = None
- res_nodes = None
- for k, v in path_dict.items():
- res_nodes = self.model.match_parent_path(start_node, v[0], v[1])
- if res_nodes is None:
- continue
- return res_nodes, k
- return res_nodes, res_path
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- paths = {
- "path1": (["Reshape", "Add", "Reshape", "Add"], [0, 0, 0, 0]),
- }
- series_nodes, path_chosen = self.match_parent_path_from_dict(node, paths)
- if not series_nodes:
- return
- last_reshape, add_2nd, _, add_1st = series_nodes
-
- biases = [
- self.model.get_initializer(add_1st.input[1]),
- self.model.get_initializer(add_2nd.input[1]),
- ]
- if not all(biases):
- return
-
- bias_arr_1st = NumpyHelper.to_array(biases[0])
- bias_arr_2nd = NumpyHelper.to_array(biases[1]).squeeze(0)
- try:
- relative_position_bias = bias_arr_1st + bias_arr_2nd
- except Exception as e:
-            print("The two biases cannot be combined:", e)
- return
-
- # Fuse
- add_name = self.model.create_node_name("Add", "Add")
- B = biases[0]
- B.CopyFrom(numpy_helper.from_array(relative_position_bias, B.name))
-
- fused_node = helper.make_node(
- "Add",
- inputs=[add_1st.input[0], B.name],
- outputs=last_reshape.output,
- name=add_name,
- )
- fused_node.domain = "com.iluvatar"
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- self.nodes_to_add.append(fused_node)
- self.nodes_to_remove.extend(series_nodes)
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_PVT_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_PVT_attention.py
deleted file mode 100644
index 2d4cc73a9dcb1c8d31d778b380bd0e8a13f454e9..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_PVT_attention.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class FusionPVTAttention(Fusion):
- """
- Fuse FusionPVTAttention subgraph into one Attention node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(
- model,
- "CustomQkvCrossToContext_IxRT",
- ["Softmax"],
- )
-
- # Flags to show warning only once
- self.num_heads_warning = False
- self.hidden_size_warning = False
-
-
- def create_decoder_attention_node(
-        self, inputs: str, outputs: str, type_mask: int, has_mask: int, scale: float
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- input (str): input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
-
- attention_node_name = self.model.create_node_name("cross_Attention")
- attention_node = helper.make_node(
- "CustomQkvCrossToContext_IxRT",
- inputs=inputs,
- outputs=outputs,
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("scale", scale)])
- attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend([helper.make_attribute("type_mask", type_mask)])
-
- return attention_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- """
- path:
-
- (query) ---------------->MatMul ---->Mul --->softmax --->MatMul--->
- / /
- (key) ---->Transpose --> /
- /
- /
- /
- (value)--------------------------------------------->
-
- """
-
- start_node = node
- qkv_paths = {
-            "path": (["Mul", "MatMul", "Transpose"], [0, 0, 0]), # cross attention query path
- }
-
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
- if qkv_nodes is None:
- logger.debug("fuse_attention: failed to match qkv path")
- return
- next_nodes = self.model.get_children(node)
- if len(next_nodes) == 0:
- return
-
- if next_nodes[0].op_type != "MatMul":
- return
-
- second_matmul_node = next_nodes[0]
- attention_outputs = second_matmul_node.output
- remove_nodes = [second_matmul_node, node]
-
-
-
- (mul_node, first_matmul_node, transpose_node) = qkv_nodes
- transpose_nodes = self.model.get_parents(first_matmul_node)
-
- q_input = transpose_nodes[0].output[0]
- k_input = transpose_nodes[1].input[0]
- v_input = second_matmul_node.input[1]
- attention_inputs = [q_input, k_input, v_input]
- remove_nodes.extend([first_matmul_node, mul_node, transpose_nodes[1]])
-
- has_mask = 0
- type_mask = 4
-
- scale = numpy_helper.to_array(self.model.get_initializer(mul_node.input[1])).item()
- atten_node = self.create_decoder_attention_node(
-            attention_inputs, attention_outputs, type_mask, has_mask, scale
- )
- self.nodes_to_add.append(atten_node)
- self.node_name_to_graph_name[atten_node.name] = self.this_graph_name
- self.nodes_to_remove.extend(remove_nodes)
\ No newline at end of file
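
What the fused `CustomQkvCrossToContext_IxRT` node stands for is ordinary scaled dot-product cross attention without a mask (hence `has_mask=0`). A NumPy reference with illustrative shapes:

```python
import numpy as np

def cross_attention_reference(q, k, v, scale):
    """MatMul -> Mul -> Softmax -> MatMul, matching the path in the docstring."""
    logits = (q @ np.swapaxes(k, -1, -2)) * scale
    logits -= logits.max(axis=-1, keepdims=True)   # numerically stable softmax
    probs = np.exp(logits)
    probs /= probs.sum(axis=-1, keepdims=True)
    return probs @ v

q = np.random.randn(2, 8, 16, 32).astype(np.float32)   # (batch, heads, q_len, head_dim)
k = np.random.randn(2, 8, 20, 32).astype(np.float32)
v = np.random.randn(2, 8, 20, 32).astype(np.float32)
print(cross_attention_reference(q, k, v, scale=1.0 / np.sqrt(32)).shape)  # (2, 8, 16, 32)
```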
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py
deleted file mode 100644
index a3e31fe7dd164b86cf9e6f4e476bc0b31246e747..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py
+++ /dev/null
@@ -1,643 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import List, Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_attention import AttentionMask
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-def get_tensor_attr(attrs, attr_name):
- result = None
- for i in attrs:
- if i.name == attr_name:
- return numpy_helper.to_array(i.t)
- return result
-
-
-class FusionAlbertAttention(Fusion):
- """
- Fuse Albert subgraph into one Attention node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- hidden_size: int,
- num_heads: int,
- attention_mask: AttentionMask,
- ):
- super().__init__(
- model,
- "CustomQKVToContextPluginDynamic_IxRT",
- ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],
- )
- self.hidden_size = hidden_size
- self.num_heads = num_heads
- self.attention_mask = attention_mask
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
- """Detect num_heads and hidden_size from a reshape node.
-
- Args:
- reshape_q (NodeProto): reshape node for Q
-
- Returns:
- Tuple[int, int]: num_heads and hidden_size
- """
-
- # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size]
- q_shape_value = self.model.get_constant_value(reshape_q.input[1])
- if q_shape_value is None:
- logger.debug(f"{reshape_q.input[1]} is not initializer.")
- return self.num_heads, self.hidden_size # Fall back to user specified value
-
- if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
- logger.debug(
-                f"q_shape_value={q_shape_value}. Expected values are like [0, 0, num_heads, head_size]."
- )
- return self.num_heads, self.hidden_size # Fall back to user specified value
-
- num_heads = q_shape_value[2]
- head_size = q_shape_value[3]
- hidden_size = num_heads * head_size
-
- if self.num_heads > 0 and num_heads != self.num_heads:
- if self.num_heads_warning:
- logger.warning(
- f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value."
- )
- self.num_heads_warning = False # Do not show the warning more than once
-
- if self.hidden_size > 0 and hidden_size != self.hidden_size:
- if self.hidden_size_warning:
- logger.warning(
- f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
- )
- self.hidden_size_warning = (
- False # Do not show the warning more than once
- )
-
- return num_heads, hidden_size
-
- def get_add_qk_str(self, add_qk: NodeProto):
- shape_infer = self.model.infer_runtime_shape(update=True)
- if shape_infer is None:
- return
-
- input_0_shape = shape_infer.get_edge_shape(add_qk.input[0])
- input_1_shape = shape_infer.get_edge_shape(add_qk.input[1])
-
- if input_0_shape is None or input_1_shape is None:
- logger.debug(f"one of the inputs of {add_qk} is None")
- return None
-
- if input_0_shape != input_1_shape:
-            logger.debug(f"the shapes of the two inputs of {add_qk} are not the same")
- return None
-
- return add_qk.input[1]
-
- def create_attention_node(
- self,
- mask_index: str,
- q_matmul: NodeProto,
- k_matmul: NodeProto,
- v_matmul: NodeProto,
- q_add: NodeProto,
- k_add: NodeProto,
- v_add: NodeProto,
- num_heads: int,
- hidden_size: int,
- input: str,
- output: str,
- add_qk_str: str,
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- mask_index (str): mask input
- q_matmul (NodeProto): MatMul node in fully connection for Q
- k_matmul (NodeProto): MatMul node in fully connection for K
- v_matmul (NodeProto): MatMul node in fully connection for V
- q_add (NodeProto): Add bias node in fully connection for Q
- k_add (NodeProto): Add bias node in fully connection for K
- v_add (NodeProto): Add bias node in fully connection for V
- num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
- hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
- input (str): input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- assert num_heads > 0
-
- if hidden_size > 0 and (hidden_size % num_heads) != 0:
- logger.debug(
- f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
- )
- return None
-
- q_weight = self.model.get_initializer(q_matmul.input[1])
- k_weight = self.model.get_initializer(k_matmul.input[1])
- v_weight = self.model.get_initializer(v_matmul.input[1])
- q_bias = self.model.get_initializer(
- q_add.input[1]
- ) or self.model.get_initializer(q_add.input[0])
- k_bias = self.model.get_initializer(
- k_add.input[1]
- ) or self.model.get_initializer(k_add.input[0])
- v_bias = self.model.get_initializer(
- v_add.input[1]
- ) or self.model.get_initializer(v_add.input[0])
-
- if q_weight is None:
- print(
- f"{q_matmul.input[1]} is not an initializer. "
- "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion"
- )
- return None
- if not (k_weight and v_weight and q_bias and k_bias):
- return None
-
- qw = NumpyHelper.to_array(q_weight)
- kw = NumpyHelper.to_array(k_weight)
- vw = NumpyHelper.to_array(v_weight)
-
- # assert q and k have same shape as expected
- assert qw.shape == kw.shape
-
- qw_in_size = qw.shape[0]
- kw_in_size = kw.shape[0]
- vw_in_size = vw.shape[0]
-
- assert qw_in_size == kw_in_size == vw_in_size
-
- if hidden_size > 0 and hidden_size != qw_in_size:
- logger.warning(
- f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). "
- "Please provide a correct input hidden size or pass in 0"
- )
-
- is_qkv_diff_dims = False
-
-        # All the matrices can have the same shape, or the q, k matrices can have the same shape while v differs
- # For 2d weights, the shapes would be [in_size, out_size].
- # For 3d weights, shape would be [in_size, a, b] where a*b = out_size
- qw_out_size = np.prod(qw.shape[1:])
- kw_out_size = np.prod(kw.shape[1:])
- vw_out_size = np.prod(vw.shape[1:])
-
- qkv_weight_dim = 0
- qkv_weight = np.concatenate((qw, kw, vw), axis=1)
- qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size
-
- qb = NumpyHelper.to_array(q_bias)
- kb = NumpyHelper.to_array(k_bias)
- vb = NumpyHelper.to_array(v_bias)
-
- q_bias_shape = np.prod(qb.shape)
- k_bias_shape = np.prod(kb.shape)
- v_bias_shape = np.prod(vb.shape)
-
- assert q_bias_shape == k_bias_shape == qw_out_size
- assert v_bias_shape == vw_out_size
-
- qkv_bias_dim = 0
- if is_qkv_diff_dims:
- qkv_bias = np.concatenate((qb, kb, vb), axis=0)
- qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape
- else:
- qkv_bias = np.stack((qb, kb, vb), axis=0)
- qkv_bias_dim = 3 * q_bias_shape
-
- attention_node_name = self.model.create_node_name("Attention")
-
- weight = helper.make_tensor(
- name=attention_node_name + "_qkv_weight",
- data_type=TensorProto.FLOAT,
- dims=[qkv_weight_dim, qw_in_size],
- vals=qkv_weight.transpose(1, 0).flatten().tolist(),
- )
-
- # Sometimes weights and bias are stored in fp16
- if q_weight.data_type == 10:
- weight.CopyFrom(
- numpy_helper.from_array(
- NumpyHelper.to_array(weight).astype(np.float16), weight.name
- )
- )
- self.model.add_initializer(weight, self.this_graph_name)
-
- bias = helper.make_tensor(
- name=attention_node_name + "_qkv_bias",
- data_type=TensorProto.FLOAT,
- dims=[qkv_bias_dim],
- vals=qkv_bias.flatten().tolist(),
- )
- if q_bias.data_type == 10:
- bias.CopyFrom(
- numpy_helper.from_array(
- NumpyHelper.to_array(bias).astype(np.float16), bias.name
- )
- )
- self.model.add_initializer(bias, self.this_graph_name)
-
- fc_output_tensor = helper.make_tensor_value_info(
- attention_node_name + "_input", TensorProto.FLOAT, [None, None, None]
- )
- fc_node = helper.make_node(
- "CustomFCPluginDynamic_IxRT",
- inputs=[input],
- outputs=[fc_output_tensor.name],
- name=self.model.create_node_name("AttentionFC", "MatMul_AddBias_"),
- )
- fc_node.domain = "com.iluvatar"
- b = NumpyHelper.to_array(bias)
- fc_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])])
- fc_node.attribute.extend([helper.make_attribute("type_id", 2)])
- fc_node.attribute.extend([helper.make_attribute("W", weight)])
- fc_node.attribute.extend([helper.make_attribute("B", bias)])
- fc_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- fc_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- fc_node.attribute.extend([helper.make_attribute("act_type", -1)])
- self.node_name_to_graph_name[fc_node.name] = self.this_graph_name
- self.nodes_to_add.append(fc_node)
-
- attention_inputs = [fc_node.output[0]]
- if mask_index is not None:
- attention_inputs.append(mask_index)
- else:
- attention_inputs.append("")
-
- if add_qk_str is not None:
- attention_inputs.append("")
- attention_inputs.append(add_qk_str)
-
- attention_node = helper.make_node(
- "CustomQKVToContextPluginDynamic_IxRT",
- inputs=attention_inputs,
- outputs=[output],
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
- attention_node.attribute.extend(
- [helper.make_attribute("hidden_size", hidden_size)]
- )
- attention_node.attribute.extend([helper.make_attribute("has_mask", 1)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)])
-
- if is_qkv_diff_dims:
- attention_node.attribute.extend(
- [
- helper.make_attribute(
- "qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size]
- )
- ]
- )
-
- return attention_node
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
-        # Sometimes we cannot fuse skiplayernormalization since the add before layernorm has an output that is used by nodes outside skiplayernorm
- # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
- start_node = normalize_node
- if normalize_node.op_type == "LayerNormalization":
- add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
- if add_before_layernorm is not None:
- start_node = add_before_layernorm
- else:
- return
-
- # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
- qkv_nodes = self.model.match_parent_path(
- start_node,
- ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
- [None, None, 0, 0, 0],
- )
- if qkv_nodes is None:
- qkv_nodes = self.model.match_parent_path(
- start_node,
- ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
- [1, None, 0, 0, 0],
- )
- einsum_node = None
- if qkv_nodes is not None:
- (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
- else:
- # Match Albert
- qkv_nodes = self.model.match_parent_path(
- start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0]
- )
- if qkv_nodes is not None:
- (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
- else:
- return
-
- other_inputs = []
- for i, input in enumerate(start_node.input):
- if input not in output_name_to_node:
- continue
-
- if input == qkv_nodes[0].output[0]:
- continue
- other_inputs.append(input)
- if len(other_inputs) != 1:
- return
-
- root_input = other_inputs[0]
- """
- Match flaubert Mask
- |
- Mul --> LayerNormalization --> Attention --> MatMul --> Add
- | |
- | |
- +---------------------------------------------------------
- """
- mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0)
- if mul_before_layernorm is not None:
- mul_children = input_name_to_nodes[mul_before_layernorm.output[0]]
- if mul_children is not None and len(mul_children) == 2:
- layernorm_node = mul_children[1]
- if layernorm_node.op_type == "LayerNormalization":
- root_input = layernorm_node.output[0]
- else:
- return
- elif mul_children is not None and len(mul_children) == 5:
- root_input = mul_before_layernorm.output[0]
- else:
- return
- elif normalize_node.op_type == "LayerNormalization":
- children = input_name_to_nodes[root_input]
- for child in children:
- if child.op_type == "LayerNormalization":
- root_input = child.output[0]
-
- children = input_name_to_nodes[root_input]
- children_types = [child.op_type for child in children]
- if children_types.count("MatMul") != 3:
- return
-
- v_nodes = self.model.match_parent_path(
- matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]
- )
- if v_nodes is None:
- logger.debug("fuse_attention: failed to match v path")
- return
- (_, _, add_v, matmul_v) = v_nodes
-
- is_distill = False
- is_distill_add = False
- is_mul_split = False
- qk_paths = {
- "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]),
- "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]),
- "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]),
- "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]),
- "path5": (["Softmax", "Add", "MatMul"], [0, 0, None])
- }
-
- qk_nodes = None
- for k, v in qk_paths.items():
- qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1])
- if qk_nodes is None:
- continue
- if k == "path3":
- is_distill = True
- if k == "path4":
- is_distill_add = True
- if k == "path5":
- is_mul_split = True
- break
-
- if qk_nodes is None:
- logger.debug("fuse_attention: failed to match qk path")
- return
- add_qk = None
- matmul_qk = None
- where_qk = None
- if is_distill:
- (_, where_qk, matmul_qk, _) = qk_nodes
- elif is_distill_add:
- (_, add_qk, where_qk, matmul_qk) = qk_nodes
- elif is_mul_split:
- (_, add_qk, matmul_qk) = qk_nodes
- else:
- (_, add_qk, _, matmul_qk) = qk_nodes
-
- q_nodes = self.model.match_parent_path(
- matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None]
- )
- if q_nodes is None:
- q_nodes = self.model.match_parent_path(
- matmul_qk,
- ["Div", "Transpose", "Reshape", "Add", "MatMul"],
- [0, 0, 0, 0, None],
- )
- if q_nodes is None and is_mul_split:
- q_nodes = self.model.match_parent_path(
- matmul_qk,
- ["Mul", "Transpose", "Reshape", "Add", "MatMul"],
- [0, 0, 0, 0, None],
- )
- if q_nodes is None:
- logger.debug("fuse_attention: failed to match q path")
- return
- reshape_q = q_nodes[-3]
- add_q = q_nodes[-2]
- matmul_q = q_nodes[-1]
-
- k_nodes = self.model.match_parent_path(
- matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]
- )
- if k_nodes is None:
- k_nodes = self.model.match_parent_path(
- matmul_qk,
- ["Transpose", "Transpose", "Reshape", "Add", "MatMul"],
- [1, 0, 0, 0, None],
- )
- if k_nodes is None and is_mul_split:
- k_nodes = self.model.match_parent_path(
- matmul_qk,
- ["Mul", "Transpose", "Reshape", "Add", "MatMul"],
- [1, 0, 0, 0, None],
- )
-
- if k_nodes is None:
- logger.debug("fuse_attention: failed to match k path")
- return
- add_k = k_nodes[-2]
- matmul_k = k_nodes[-1]
-
- # Note that Cast might be removed by OnnxRuntime so we match two patterns here.
- mask_nodes = None
- add_qk_str = None
- if is_distill:
- _, mask_nodes, _ = self.model.match_parent_paths(
- where_qk,
- [
- (["Expand", "Reshape", "Equal"], [0, 0, 0]),
- (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
- (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]),
- ],
- output_name_to_node,
- )
- elif is_distill_add:
- _, mask_nodes, _ = self.model.match_parent_paths(
- where_qk,
- [
- (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]),
- (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
- ],
- output_name_to_node,
- )
- if add_qk is not None:
- add_qk_str = self.get_add_qk_str(add_qk)
- if add_qk_str is None:
- logger.debug(
- f"fuse_attention: failed to verify shape inference of {add_qk}"
- )
- return
- elif is_mul_split:
- _, mask_nodes, _ = self.model.match_parent_paths(
- add_qk,
- [
- (["Where", "Cast", "Sub", "Cast", "Expand", "Unsqueeze"], [None, 0, 0, 1, 0, 0])
- ],
- output_name_to_node,
- )
- else:
- _, mask_nodes, _ = self.model.match_parent_paths(
- add_qk,
- [
- (
- ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"],
- [None, 0, 1, 0, 0],
- ),
- (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]),
- (["Mul", "Sub", "Cast", "Unsqueeze"], [None, 0, 1, 0]),
- ],
- output_name_to_node,
- )
- if mask_nodes is None:
- logger.debug("fuse_attention: failed to match mask path")
- return
-
- if (
- matmul_v.input[0] == root_input
- and matmul_q.input[0] == root_input
- and matmul_k.input[0] == root_input
- ):
- # mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0])
- if mask_nodes[0].op_type == "Mul":
- mask_val = self.model.get_initializer(mask_nodes[0].input[1])
- if mask_val is not None:
- mask_val_arr = NumpyHelper.to_array(mask_val)
- mask_val_arr = np.where(mask_val_arr <= -100, -100, 0.0).astype(
- np.float32
- )
- mask_val.CopyFrom(
- numpy_helper.from_array(mask_val_arr, mask_val.name)
- )
- mask_index = mask_nodes[0].output[0]
-
- attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv
-
- q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
-            # the number of heads is the same for all paths, so to create the attention node we pass q_num_heads
-            # the input_hidden_size represents the input hidden size; it is used as needed, but the hidden sizes for Q, K are extracted appropriately
- new_node = self.create_attention_node(
- mask_index,
- matmul_q,
- matmul_k,
- matmul_v,
- add_q,
- add_k,
- add_v,
- q_num_heads,
- q_hidden_size,
- root_input,
- attention_last_node.output[0],
- add_qk_str,
- )
- if new_node is None:
- return
-
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
-
- if einsum_node is not None:
- unique_index = einsum_node.input[0]
- new_edge = "edge_modified_" + unique_index
- shape_tensor = helper.make_tensor(
- name="shape_modified_tensor" + unique_index,
- data_type=TensorProto.INT64,
- dims=[4],
- vals=np.int64(
- [0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]
- ).tobytes(),
- raw=True,
- )
- self.model.add_initializer(shape_tensor, self.this_graph_name)
- self.model.add_node(
- helper.make_node(
- "Reshape",
- [attention_last_node.output[0], shape_tensor.name],
- [new_edge],
- "reshape_modified_" + unique_index,
- ),
- self.this_graph_name,
- )
- einsum_node.input[0] = new_edge
-
- self.nodes_to_remove.extend(
- [attention_last_node, transpose_qkv, matmul_qkv]
- )
- self.nodes_to_remove.extend(qk_nodes)
- self.nodes_to_remove.extend(q_nodes)
- self.nodes_to_remove.extend(k_nodes)
- self.nodes_to_remove.extend(v_nodes)
-
- # Use prune graph to remove mask nodes since they are shared by all attention nodes.
- # self.nodes_to_remove.extend(mask_nodes)
- self.prune_graph = True
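
The packed QKV projection built in `create_attention_node` above (weights concatenated along the output dimension) is numerically equivalent to the three separate projections it replaces. A quick NumPy check with a hypothetical hidden size:

```python
import numpy as np

hidden = 768  # hypothetical hidden size
qw, kw, vw = (np.random.randn(hidden, hidden).astype(np.float32) for _ in range(3))
x = np.random.randn(4, hidden).astype(np.float32)

packed = np.concatenate((qw, kw, vw), axis=1)   # analogue of qkv_weight above
q, k, v = np.split(x @ packed, 3, axis=1)       # one MatMul instead of three

# Differences should be ~0 (up to float rounding):
print(np.abs(q - x @ qw).max(), np.abs(k - x @ kw).max(), np.abs(v - x @ vw).max())
```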
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py
deleted file mode 100644
index 38ddf62986b46b350cdf158eeccfcf1e3602fe0c..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py
+++ /dev/null
@@ -1,634 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class AttentionMask:
- """
- Fuse Attention subgraph into one Attention node.
- """
-
- def __init__(self, model: OnnxModel):
- self.model = model
- # A lookup table with mask input as key, and mask index output as value
- self.mask_indice = {}
- # A lookup table with mask input as key, and cast (to int32) output as value
- self.mask_casted = {}
- self.utils = FusionUtils(model)
- self.mask_format = AttentionMaskFormat.MaskIndexEnd
-
- def set_mask_format(self, mask_format: AttentionMaskFormat):
- self.mask_format = mask_format
-
- def set_mask_indice(self, mask, mask_index):
- if mask in self.mask_indice:
- assert mask_index == self.mask_indice[mask]
- self.mask_indice[mask] = mask_index
-
- def get_first_mask(self):
- assert len(self.mask_indice) > 0
- return next(iter(self.mask_indice))
-
- def process_mask(self, input: str) -> str:
- if self.mask_format == AttentionMaskFormat.NoMask:
- return None
-
- if input in self.mask_indice:
- return self.mask_indice[input]
-
- # Add cast to convert int64 to int32
- if self.model.find_graph_input(input):
- casted, input_name = self.utils.cast_graph_input_to_int32(input)
- else:
- input_name, cast_node = self.utils.cast_input_to_int32(input)
- casted = True
-
- if casted:
- self.mask_casted[input] = input_name
-
- # Attention supports int32 attention mask (2D) since 1.4.0
- if self.mask_format == AttentionMaskFormat.AttentionMask:
- self.mask_indice[input] = input_name
- return input_name
-
- # Add a mask processing node to convert attention mask to mask index (1D)
- output_name = self.model.create_node_name("mask_index")
- mask_index_node = helper.make_node(
- "ReduceSum",
- inputs=[input_name],
- outputs=[output_name],
- name=self.model.create_node_name("ReduceSum", "MaskReduceSum"),
- )
- mask_index_node.attribute.extend(
- [helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)]
- )
- self.model.add_node(mask_index_node)
-
- self.mask_indice[input] = output_name
- return output_name
-
-
-class FusionAttention(Fusion):
- """
- Fuse Attention subgraph into one Attention node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- hidden_size: int,
- num_heads: int,
- attention_mask: AttentionMask,
- ):
- super().__init__(
- model, "Attention", ["SkipLayerNormalization", "LayerNormalization"]
- )
- self.hidden_size = hidden_size
- self.num_heads = num_heads
- self.attention_mask = attention_mask
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
- """Detect num_heads and hidden_size from a reshape node.
-
- Args:
- reshape_q (NodeProto): reshape node for Q
-
- Returns:
- Tuple[int, int]: num_heads and hidden_size
- """
-
- # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size]
- q_shape = self.model.get_initializer(reshape_q.input[1])
- if q_shape is None:
- logger.debug(f"{reshape_q.input[1]} is not initializer.")
- return self.num_heads, self.hidden_size # Fall back to user specified value
-
- q_shape_value = NumpyHelper.to_array(q_shape)
- if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
- logger.debug(
-                f"q_shape_value={q_shape_value}. Expected values are like [0, 0, num_heads, head_size]."
- )
- return self.num_heads, self.hidden_size # Fall back to user specified value
-
- num_heads = q_shape_value[2]
- head_size = q_shape_value[3]
- hidden_size = num_heads * head_size
-
- if self.num_heads > 0 and num_heads != self.num_heads:
- if self.num_heads_warning:
- logger.warning(
- f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value."
- )
- self.num_heads_warning = False # Do not show the warning more than once
-
- if self.hidden_size > 0 and hidden_size != self.hidden_size:
- if self.hidden_size_warning:
- logger.warning(
- f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
- )
- self.hidden_size_warning = (
- False # Do not show the warning more than once
- )
-
- return num_heads, hidden_size
-
- def get_add_qk_str(self, add_qk: NodeProto):
- shape_infer = self.model.infer_runtime_shape(update=True)
- if shape_infer is None:
- return
-
- input_0_shape = shape_infer.get_edge_shape(add_qk.input[0])
- input_1_shape = shape_infer.get_edge_shape(add_qk.input[1])
-
- if input_0_shape is None or input_1_shape is None:
- logger.debug(f"one of the inputs of {add_qk} is None")
- return None
-
- if input_0_shape != input_1_shape:
-            logger.debug(f"the shapes of the two inputs of {add_qk} are not the same")
- return None
-
- return add_qk.input[1]
-
- def create_attention_node(
- self,
- mask_index: str,
- q_matmul: NodeProto,
- k_matmul: NodeProto,
- v_matmul: NodeProto,
- q_add: NodeProto,
- k_add: NodeProto,
- v_add: NodeProto,
- num_heads: int,
- hidden_size: int,
- input: str,
- output: str,
- add_qk_str: str,
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- mask_index (str): mask input
- q_matmul (NodeProto): MatMul node in fully connection for Q
- k_matmul (NodeProto): MatMul node in fully connection for K
- v_matmul (NodeProto): MatMul node in fully connection for V
- q_add (NodeProto): Add bias node in fully connection for Q
- k_add (NodeProto): Add bias node in fully connection for K
- v_add (NodeProto): Add bias node in fully connection for V
- num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
- hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
- input (str): input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- assert num_heads > 0
-
- if hidden_size > 0 and (hidden_size % num_heads) != 0:
- logger.debug(
- f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
- )
- return None
-
- q_weight = self.model.get_initializer(q_matmul.input[1])
- k_weight = self.model.get_initializer(k_matmul.input[1])
- v_weight = self.model.get_initializer(v_matmul.input[1])
- q_bias = self.model.get_initializer(
- q_add.input[1]
- ) or self.model.get_initializer(q_add.input[0])
- k_bias = self.model.get_initializer(
- k_add.input[1]
- ) or self.model.get_initializer(k_add.input[0])
- v_bias = self.model.get_initializer(
- v_add.input[1]
- ) or self.model.get_initializer(v_add.input[0])
-
- if q_weight is None:
- print(
- f"{q_matmul.input[1]} is not an initializer. "
- "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion"
- )
- return None
- if not (k_weight and v_weight and q_bias and k_bias):
- return None
-
- qw = NumpyHelper.to_array(q_weight)
- kw = NumpyHelper.to_array(k_weight)
- vw = NumpyHelper.to_array(v_weight)
-
- # assert q and k have same shape as expected
- assert qw.shape == kw.shape
-
- qw_in_size = qw.shape[0]
- kw_in_size = kw.shape[0]
- vw_in_size = vw.shape[0]
-
- assert qw_in_size == kw_in_size == vw_in_size
-
- if hidden_size > 0 and hidden_size != qw_in_size:
- logger.warning(
- f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). "
- "Please provide a correct input hidden size or pass in 0"
- )
-
- is_qkv_diff_dims = False
- if qw.shape != vw.shape:
- is_qkv_diff_dims = True
-
-        # All the matrices can have the same shape, or the q, k matrices can have the same shape while v differs
- # For 2d weights, the shapes would be [in_size, out_size].
- # For 3d weights, shape would be [in_size, a, b] where a*b = out_size
- qw_out_size = np.prod(qw.shape[1:])
- kw_out_size = np.prod(kw.shape[1:])
- vw_out_size = np.prod(vw.shape[1:])
-
- qkv_weight_dim = 0
- if is_qkv_diff_dims:
- qkv_weight = np.concatenate((qw, kw, vw), axis=1)
- qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size
- else:
- qkv_weight = np.stack((qw, kw, vw), axis=1)
- qkv_weight_dim = 3 * qw_out_size
-
- qb = NumpyHelper.to_array(q_bias)
- kb = NumpyHelper.to_array(k_bias)
- vb = NumpyHelper.to_array(v_bias)
-
- q_bias_shape = np.prod(qb.shape)
- k_bias_shape = np.prod(kb.shape)
- v_bias_shape = np.prod(vb.shape)
-
- assert q_bias_shape == k_bias_shape == qw_out_size
- assert v_bias_shape == vw_out_size
-
- qkv_bias_dim = 0
- if is_qkv_diff_dims:
- qkv_bias = np.concatenate((qb, kb, vb), axis=0)
- qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape
- else:
- qkv_bias = np.stack((qb, kb, vb), axis=0)
- qkv_bias_dim = 3 * q_bias_shape
-
- attention_node_name = self.model.create_node_name("Attention")
-
- weight = helper.make_tensor(
- name=attention_node_name + "_qkv_weight",
- data_type=TensorProto.FLOAT,
- dims=[qw_in_size, qkv_weight_dim],
- vals=qkv_weight.flatten().tolist(),
- )
-
- # Sometimes weights and bias are stored in fp16
- if q_weight.data_type == 10:
- weight.CopyFrom(
- numpy_helper.from_array(
- NumpyHelper.to_array(weight).astype(np.float16), weight.name
- )
- )
- self.model.add_initializer(weight, self.this_graph_name)
-
- bias = helper.make_tensor(
- name=attention_node_name + "_qkv_bias",
- data_type=TensorProto.FLOAT,
- dims=[qkv_bias_dim],
- vals=qkv_bias.flatten().tolist(),
- )
- if q_bias.data_type == 10:
- bias.CopyFrom(
- numpy_helper.from_array(
- NumpyHelper.to_array(bias).astype(np.float16), bias.name
- )
- )
- self.model.add_initializer(bias, self.this_graph_name)
-
- attention_inputs = [
- input,
- attention_node_name + "_qkv_weight",
- attention_node_name + "_qkv_bias",
- ]
- if mask_index is not None:
- attention_inputs.append(mask_index)
- else:
- attention_inputs.append("")
-
- if add_qk_str is not None:
- attention_inputs.append("")
- attention_inputs.append(add_qk_str)
-
- attention_node = helper.make_node(
- "Attention",
- inputs=attention_inputs,
- outputs=[output],
- name=attention_node_name,
- )
- attention_node.domain = "com.microsoft"
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
-
- if is_qkv_diff_dims:
- attention_node.attribute.extend(
- [
- helper.make_attribute(
- "qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size]
- )
- ]
- )
-
- return attention_node
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
-        # Sometimes we cannot fuse skiplayernormalization since the add before layernorm has an output that is used by nodes outside skiplayernorm
- # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
- start_node = normalize_node
- if normalize_node.op_type == "LayerNormalization":
- add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
- if add_before_layernorm is not None:
- start_node = add_before_layernorm
- else:
- return
-
- # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
- qkv_nodes = self.model.match_parent_path(
- start_node,
- ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
- [None, None, 0, 0, 0],
- )
- einsum_node = None
- if qkv_nodes is not None:
- (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
- else:
- # Match Albert
- qkv_nodes = self.model.match_parent_path(
- start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0]
- )
- if qkv_nodes is not None:
- (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
- else:
- return
-
- other_inputs = []
- for i, input in enumerate(start_node.input):
- if input not in output_name_to_node:
- continue
-
- if input == qkv_nodes[0].output[0]:
- continue
- other_inputs.append(input)
- if len(other_inputs) != 1:
- return
-
- root_input = other_inputs[0]
- """
- Match flaubert Mask
- |
- Mul --> LayerNormalization --> Attention --> MatMul --> Add
- | |
- | |
- +---------------------------------------------------------
- """
- mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0)
- if mul_before_layernorm is not None:
- mul_children = input_name_to_nodes[mul_before_layernorm.output[0]]
- if mul_children is not None and len(mul_children) == 2:
- layernorm_node = mul_children[1]
- if layernorm_node.op_type == "LayerNormalization":
- root_input = layernorm_node.output[0]
- else:
- return
- elif mul_children is not None and len(mul_children) == 5:
- root_input = mul_before_layernorm.output[0]
- else:
- return
- elif normalize_node.op_type == "LayerNormalization":
- children = input_name_to_nodes[root_input]
- for child in children:
- if child.op_type == "LayerNormalization":
- root_input = child.output[0]
-
- children = input_name_to_nodes[root_input]
- children_types = [child.op_type for child in children]
- if children_types.count("MatMul") != 3:
- return
-
- v_nodes = self.model.match_parent_path(
- matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]
- )
- if v_nodes is None:
- logger.debug("fuse_attention: failed to match v path")
- return
- (_, _, add_v, matmul_v) = v_nodes
-
- is_distill = False
- is_distill_add = False
- qk_paths = {
- "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]),
- "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]),
- "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]),
- "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]),
- }
-
- qk_nodes = None
- for k, v in qk_paths.items():
- qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1])
- if qk_nodes is None:
- continue
- if k == "path3":
- is_distill = True
- if k == "path4":
- is_distill_add = True
- break
-
- if qk_nodes is None:
- logger.debug("fuse_attention: failed to match qk path")
- return
-
- add_qk = None
- matmul_qk = None
- where_qk = None
- if is_distill:
- (_, where_qk, matmul_qk, _) = qk_nodes
- elif is_distill_add:
- (_, add_qk, where_qk, matmul_qk) = qk_nodes
- else:
- (_, add_qk, _, matmul_qk) = qk_nodes
-
- q_nodes = self.model.match_parent_path(
- matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None]
- )
- if q_nodes is None:
- q_nodes = self.model.match_parent_path(
- matmul_qk,
- ["Div", "Transpose", "Reshape", "Add", "MatMul"],
- [0, 0, 0, 0, None],
- )
- if q_nodes is None:
- logger.debug("fuse_attention: failed to match q path")
- return
- reshape_q = q_nodes[-3]
- add_q = q_nodes[-2]
- matmul_q = q_nodes[-1]
-
- k_nodes = self.model.match_parent_path(
- matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]
- )
- if k_nodes is None:
- k_nodes = self.model.match_parent_path(
- matmul_qk,
- ["Transpose", "Transpose", "Reshape", "Add", "MatMul"],
- [1, 0, 0, 0, None],
- )
- if k_nodes is None:
- logger.debug("fuse_attention: failed to match k path")
- return
- add_k = k_nodes[-2]
- matmul_k = k_nodes[-1]
-
- # Note that Cast might be removed by OnnxRuntime so we match two patterns here.
- mask_nodes = None
- add_qk_str = None
- if is_distill:
- _, mask_nodes, _ = self.model.match_parent_paths(
- where_qk,
- [
- (["Expand", "Reshape", "Equal"], [0, 0, 0]),
- (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
- (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]),
- ],
- output_name_to_node,
- )
- elif is_distill_add:
- _, mask_nodes, _ = self.model.match_parent_paths(
- where_qk,
- [
- (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]),
- (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
- ],
- output_name_to_node,
- )
- if add_qk is not None:
- add_qk_str = self.get_add_qk_str(add_qk)
- if add_qk_str is None:
- logger.debug(
- f"fuse_attention: failed to verify shape inference of {add_qk}"
- )
- return
- else:
- _, mask_nodes, _ = self.model.match_parent_paths(
- add_qk,
- [
- (
- ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"],
- [None, 0, 1, 0, 0],
- ),
- (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]),
- ],
- output_name_to_node,
- )
- if mask_nodes is None:
- logger.debug("fuse_attention: failed to match mask path")
- return
-
- if (
- matmul_v.input[0] == root_input
- and matmul_q.input[0] == root_input
- and matmul_k.input[0] == root_input
- ):
- mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0])
-
- attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv
-
- q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
- # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads
- # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately
- new_node = self.create_attention_node(
- mask_index,
- matmul_q,
- matmul_k,
- matmul_v,
- add_q,
- add_k,
- add_v,
- q_num_heads,
- q_hidden_size,
- root_input,
- attention_last_node.output[0],
- add_qk_str,
- )
- if new_node is None:
- return
-
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
-
- if einsum_node is not None:
- unique_index = einsum_node.input[0]
- new_edge = "edge_modified_" + unique_index
- shape_tensor = helper.make_tensor(
- name="shape_modified_tensor" + unique_index,
- data_type=TensorProto.INT64,
- dims=[4],
- vals=np.int64(
- [0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]
- ).tobytes(),
- raw=True,
- )
- self.model.add_initializer(shape_tensor, self.this_graph_name)
- self.model.add_node(
- helper.make_node(
- "Reshape",
- [attention_last_node.output[0], shape_tensor.name],
- [new_edge],
- "reshape_modified_" + unique_index,
- ),
- self.this_graph_name,
- )
- einsum_node.input[0] = new_edge
-
- self.nodes_to_remove.extend(
- [attention_last_node, transpose_qkv, matmul_qkv]
- )
- self.nodes_to_remove.extend(qk_nodes)
- self.nodes_to_remove.extend(q_nodes)
- self.nodes_to_remove.extend(k_nodes)
- self.nodes_to_remove.extend(v_nodes)
-
- # Use prune graph to remove mask nodes since they are shared by all attention nodes.
- # self.nodes_to_remove.extend(mask_nodes)
- self.prune_graph = True
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py
deleted file mode 100644
index 3732b0f5fab40cbb269f18abdd56286f298a5493..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from logging import getLogger
-from typing import List, Union
-
-from onnx import GraphProto
-
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class Fusion:
- def __init__(
- self,
- model: OnnxModel,
- fused_op_type: str,
- search_op_types: Union[str, List[str]],
- description: str = None,
- ):
- self.search_op_types: List[str] = (
- [search_op_types] if isinstance(search_op_types, str) else search_op_types
- )
- self.fused_op_type: str = fused_op_type
- self.description: str = (
- f"{fused_op_type}({description})" if description else fused_op_type
- )
- self.model: OnnxModel = model
- self.nodes_to_remove: List = []
- self.nodes_to_add: List = []
- self.prune_graph: bool = False
- self.node_name_to_graph_name: dict = {}
- self.this_graph_name: str = None
- # It is optional that subclass updates fused_count since we will also check nodes_to_add to get counter.
- self.fused_count: int = 0
-
- def apply(self):
- logger.debug(f"start {self.description} fusion...")
- input_name_to_nodes = self.model.input_name_to_nodes()
- output_name_to_node = self.model.output_name_to_node()
-
- # This assumes that two search ops will not be fused at same time!
- for search_op_type in self.search_op_types:
- for node in self.model.get_nodes_by_op_type(search_op_type):
- graph = self.model.get_graph_by_node(node)
- if graph is None:
- raise Exception("Can not find node in any graphs")
- self.this_graph_name = graph.name
- self.fuse(node, input_name_to_nodes, output_name_to_node)
-
- op_list = [node.op_type for node in self.nodes_to_add]
- count = max(self.fused_count, op_list.count(self.fused_op_type))
- if count > 0:
- logger.info(f"Fused {self.description} count: {count}")
-
- self.model.remove_nodes(self.nodes_to_remove)
- self.model.add_nodes(self.nodes_to_add, self.node_name_to_graph_name)
-
- if self.prune_graph:
- self.model.prune_graph()
- elif self.nodes_to_remove or self.nodes_to_add:
- self.model.update_graph()
-
- def match_parent_path_from_dict(
- self, start_node, path_dict, output_name_to_node=None, return_indice=None
- ):
- res_path = None
- res_nodes = None
- for k, v in path_dict.items():
- res_nodes = self.model.match_parent_path(
- start_node,
- v[0],
- v[1],
- output_name_to_node=output_name_to_node,
- return_indice=return_indice,
- )
- if res_nodes is None:
- continue
- return res_nodes, k
- return res_nodes, res_path
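For orientation, the sketch below (not part of the deleted sources; the pass name, matched pattern, and fused op type are illustrative assumptions) shows how a pass built on the `Fusion` base class above typically drives `match_parent_path_from_dict`:

```python
# Hypothetical example pass; names and patterns are assumptions, not code from this repo.
from .fusion_base import Fusion
from .onnx_model import OnnxModel


class FusionExampleBiasAdd(Fusion):
    def __init__(self, model: OnnxModel):
        # Visit every "Add" node; fused nodes are reported as "ExampleBiasAdd".
        super().__init__(model, "ExampleBiasAdd", ["Add"])

    def fuse(self, node, input_name_to_nodes, output_name_to_node):
        # Each key maps to (op-type path, parent input indices); the first path
        # that matches is returned together with its key.
        paths = {
            "gemm": (["Gemm"], [0]),
            "matmul": (["MatMul"], [0]),
        }
        parent_nodes, matched_key = self.match_parent_path_from_dict(node, paths)
        if parent_nodes is None:
            return  # no upstream Gemm/MatMul feeding this Add
        # A real pass would build a fused NodeProto here and register it:
        #   self.nodes_to_add.append(fused_node)
        #   self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
        #   self.nodes_to_remove.extend(parent_nodes + [node])
```

`apply()` then walks all matching nodes, calls `fuse()` on each, and applies the queued additions and removals, pruning the graph if `prune_graph` was set.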
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py
deleted file mode 100644
index 045cd99380a7535079d0f9f33322e2879d2074c0..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionBiasGelu(Fusion):
- def __init__(self, model: OnnxModel, is_fastgelu):
- if is_fastgelu:
- super().__init__(model, "FastGelu", "FastGelu", "add bias")
- else:
- super().__init__(model, "BiasGelu", "Gelu")
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- gelu_op_type = node.op_type
- fuse_op_type = "BiasGelu" if gelu_op_type == "Gelu" else "FastGelu"
-
- if len(node.input) != 1:
- return
-
- nodes = self.model.match_parent_path(node, ["Add", "MatMul"], [0, None])
- if nodes is None:
- return
- (add, matmul) = nodes
-
- bias_weight = None
- # bias should be one dimension
- bias_index = -1
- for i, input in enumerate(add.input):
- initializer = self.model.get_initializer(input)
- if initializer is None:
- continue
- bias_index = i
- bias_weight = NumpyHelper.to_array(initializer)
- break
- if bias_weight is None:
- return
- if len(bias_weight.shape) != 1:
- return
-
- subgraph_nodes = [node, add]
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
-
- fused_node = helper.make_node(
- fuse_op_type,
- inputs=[matmul.output[0], add.input[bias_index]],
- outputs=node.output,
- name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"),
- )
- fused_node.domain = "com.microsoft"
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py
deleted file mode 100644
index 21161727373b1ceee5362bc2fa0e713f17e899ae..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class FusionConformerAttention(Fusion):
- """
- Fuse the conformer attention subgraph into one Attention node.
- """
-
- def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
- super().__init__(model, "CustomQKVToContextPluginDynamic_IxRT", ["Concat"])
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- self.hidden_size = hidden_size
- self.num_heads = num_heads
-
- def get_num_heads_and_hidden_size(
- self, atten_matmul: NodeProto, div: NodeProto
- ) -> Tuple[int, int]:
- """Detect num_heads and hidden_size from a reshape node.
-
- Args:
- reshape_q (NodeProto): reshape node for Q
-
- Returns:
- Tuple[int, int]: num_heads and hidden_size
- """
-
- # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size]
- atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1])
- div_initializer = self.model.get_initializer(div.input[1])
-
- # Check whether float_data is empty
- if len(div_initializer.float_data) > 0:
- div_value = div_initializer.float_data[0]
- else:
- # If float_data is empty, try to read the data another way,
- # e.g. when it is stored in raw_data
- if len(div_initializer.raw_data) > 0:
- dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type]
- div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0]
- else:
- raise ValueError("Data not found in the div_initializer")
-
- atten_matul_shape_value = NumpyHelper.to_array(atten_matul_initializer).shape
- head_dim = math.ceil(div_value * div_value)
- hidden_size = atten_matul_shape_value[0]
- num_heads = hidden_size // head_dim
-
- return num_heads, hidden_size
-
- def create_attention_node(
- self, num_heads: int, hidden_size: int, inputs: str, outputs: str
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
- hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
- input (str): input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- assert num_heads > 0
-
- if hidden_size > 0 and (hidden_size % num_heads) != 0:
- logger.debug(
- f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
- )
- return None
-
- attention_node_name = self.model.create_node_name("Attention")
-
- attention_node = helper.make_node(
- "CustomQKVToContextPluginDynamic_IxRT",
- inputs=inputs,
- outputs=outputs,
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
- attention_node.attribute.extend(
- [helper.make_attribute("hidden_size", hidden_size)]
- )
- attention_node.attribute.extend([helper.make_attribute("has_mask", 1)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)])
-
- return attention_node
-
- def fuse_reshape(self, shape_data_name):
-
- shape_tensor = helper.make_tensor(
- name=shape_data_name,
- data_type=TensorProto.INT64,
- dims=[3],
- vals=np.int64([128, -1, self.hidden_size // self.num_heads]).tobytes(),
- raw=True,
- )
- self.model.add_initializer(shape_tensor, self.this_graph_name)
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
- # Sometimes we cannot fuse skiplayernormalization since the add before layernorm has an output that is used by nodes outside skiplayernorm
- # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
- start_node = normalize_node
-
- paths = {
- "path": (
- ["Unsqueeze", "Mul", "Gather", "Shape", "LayerNormalization"],
- [None, None, None, None, None],
- ),
- }
-
- reshape_nodes, reshape_path = self.match_parent_path_from_dict(
- start_node, paths
- )
- if reshape_nodes is None:
- return
-
- self.nodes_to_remove.append(start_node)
-
- self.nodes_to_remove.extend(reshape_nodes[:-1])
- self.fuse_reshape(start_node.output[0])
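A note on the arithmetic in `get_num_heads_and_hidden_size` above: the `Div` before the Softmax divides attention scores by `sqrt(head_dim)`, so squaring the divisor recovers `head_dim`, and `num_heads` follows from the row count of the attention output projection. A standalone check with hypothetical values:

```python
# Hypothetical values (8.0 and 512) purely to illustrate the head_dim recovery.
import math

div_value = 8.0      # divisor read from the Div initializer, i.e. sqrt(head_dim)
hidden_size = 512    # first dim of the attention output MatMul weight
head_dim = math.ceil(div_value * div_value)   # 64
num_heads = hidden_size // head_dim           # 8
print(num_heads, hidden_size)                 # -> 8 512
```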
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py
deleted file mode 100644
index b55c2412b07067d3ebb05cc080be6a3a31902e22..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Tuple, Union
-
-import numpy as np
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionConformerXSoftmax(Fusion):
- """
- Fuse Where + Softmax + Where into one node: XSoftmax
- """
-
- def __init__(self, model: OnnxModel):
- super().__init__(model, "XSoftmax_IxRT", "Softmax")
-
- def create_xsoftmax_node(
- self, data_input: str, mask_input: str, output: str
- ) -> Union[NodeProto, None]:
- """Create an XSoftmax node.
-
- Args:
- data_input (str): data input name
- mask_input (str): mask input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
-
- unique_index = data_input
- new_edge = "edge_modified_" + unique_index
- shape_tensor = helper.make_tensor(
- name="shape_modified_tensor_" + unique_index,
- data_type=TensorProto.INT64,
- dims=[4],
- vals=np.int64(
- [-1, 8, 128, 128] # (BSZ, HEAD_NUM, SEQ_LEN, SEQ_LEN)
- ).tobytes(),
- raw=True,
- )
- self.model.add_initializer(shape_tensor, self.this_graph_name)
- self.model.add_node(
- helper.make_node(
- "Reshape",
- [data_input, shape_tensor.name],
- [new_edge],
- "reshape_modified_" + unique_index,
- ),
- self.this_graph_name,
- )
-
- new_edge2 = "edge_modified2_" + unique_index
- xsoftmax_node_name = self.model.create_node_name("XSoftmax")
-
- xsoftmax_node = helper.make_node(
- "XSoftmax_IxRT",
- inputs=[new_edge, mask_input],
- outputs=[new_edge2],
- name=xsoftmax_node_name,
- )
- xsoftmax_node.domain = "com.iluvatar"
- xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)])
- xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)])
- xsoftmax_node.attribute.extend([helper.make_attribute("is_conformer", 1)])
-
- shape_tensor2 = helper.make_tensor(
- name="shape_modified_tensor2_" + unique_index,
- data_type=TensorProto.INT64,
- dims=[3],
- vals=np.int64(
- [-1, 128, 128] # (BSZ, HEAD_NUM, SEQ_LEN, SEQ_LEN)
- ).tobytes(),
- raw=True,
- )
- self.model.add_initializer(shape_tensor2, self.this_graph_name)
- self.model.add_node(
- helper.make_node(
- "Reshape",
- [new_edge2, shape_tensor2.name],
- [output],
- "reshape_modified2_" + unique_index,
- ),
- self.this_graph_name,
- )
-
- return xsoftmax_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- xsoftmax_paths = {
- "path": (["Add", "Where", "Reshape", "Expand"], [None, None, None, None]),
- }
- xsoftmax_nodes, xsoftmax_path = self.match_parent_path_from_dict(
- node, xsoftmax_paths
- )
-
- if xsoftmax_nodes is None:
- logger.debug("fuse_xsoftmax: failed to match xsoftmax path")
- return
- else:
- (add_node, where_node, reshape_node, expand_node) = xsoftmax_nodes
-
- mask_input = expand_node.input[0]
-
- data_output = node.output[0]
-
- data_input = add_node.input[0]
- if where_node.output[0] == add_node.input[0]:
- data_input = add_node.input[1]
- xsoftmax_node = self.create_xsoftmax_node(
- data_input, mask_input, data_output
- )
-
- self.nodes_to_remove.extend(xsoftmax_nodes)
- self.nodes_to_add.append(xsoftmax_node)
- self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conv_reformat.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conv_reformat.py
deleted file mode 100644
index 23cdd0c2d0dca61bf66eb1f484e3093f4d7bf0c6..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conv_reformat.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class FusionConvReformat(Fusion):
- """
- Fuse the Transpose + Reshape subgraphs around Conv into FuseConvReformat_IxRT nodes.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(
- model,
- "FuseConvReformat_IxRT",
- ["Transpose"],
- )
-
-
-
- def create_fuse_node(
- self, inputs: str, outputs: str, before_conv: int, shape_data: list, prefix
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- input (str): input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
-
- node_name = self.model.create_node_name(f"FuseConvReformat_{prefix}")
- node = helper.make_node(
- "FuseConvReformat_IxRT",
- inputs=inputs,
- outputs=outputs,
- name=node_name,
- )
- node.domain = "com.iluvatar"
-
- node.attribute.extend([helper.make_attribute("before_conv", before_conv)])
- node.attribute.extend([helper.make_attribute("shape_data", shape_data)])
- node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- return node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- """
- eliminate Transpose(linear->nchw) + Transpose
- path:
- ----->Transpose ---->Reshape---> conv ----->Reshape ---->Transpose--->
-
- to:
- ----->FuseConvReformat_IxRT---> conv ----->FuseConvReformat_IxRT--->
-
- """
- start_node = node
- paths = {
- "path": (["Reshape", "Conv", "Reshape","Transpose"], [0, 0, 0, 0]), # cross attention qery pass
- }
-
- nodes, path = self.match_parent_path_from_dict(start_node, paths)
-
- if nodes is None:
- logger.debug("FuseConvReformat: failed to match path")
- return
-
- (reshape_after_node, conv_node, reshape_before_node, tranpose_before_node) = nodes
-
- perm1 = tranpose_before_node.attribute[0].ints
- if perm1 !=[0, 2, 1]:
- return
- perm2 = start_node.attribute[0].ints
- if perm2 !=[0, 2, 1]:
- return
-
- before_shape_data = numpy_helper.to_array(self.model.get_initializer(reshape_before_node.input[1]))
-
- if before_shape_data.shape[0] != 4:
- return
-
- after_shape_data = numpy_helper.to_array(self.model.get_initializer(reshape_after_node.input[1]))
- if after_shape_data.shape[0] != 3:
- return
- node1_inputs = tranpose_before_node.input
- node1_outputs = reshape_before_node.output
- node1_before_conv = 1
-
- new_node1 = self.create_fuse_node(
- node1_inputs, node1_outputs, node1_before_conv, before_shape_data,"before")
-
-
- node2_inputs = conv_node.output
- node2_outputs = start_node.output
- node2_before_conv = 0
- new_node2 = self.create_fuse_node(
- node2_inputs, node2_outputs, node2_before_conv, after_shape_data,"after")
-
- self.nodes_to_add.append(new_node1)
- self.nodes_to_add.append(new_node2)
- self.node_name_to_graph_name[new_node1.name] = self.this_graph_name
- self.node_name_to_graph_name[new_node2.name] = self.this_graph_name
- self.nodes_to_remove.extend([start_node, reshape_after_node,reshape_before_node,tranpose_before_node])
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_attention.py
deleted file mode 100644
index 5bfa8768e7077fad40b9ef8ff51427db217a5069..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_attention.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-
-class FusionCosyvoiceAttention(Fusion):
- """
- Fuse the CosyVoice attention subgraph into one CustomQkvCrossToContext_IxRT node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(
- model,
- "CustomQkvCrossToContext_IxRT",
- ["Softmax"],
- )
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
- """Detect num_heads and hidden_size from a reshape node.
-
- Args:
- reshape_q (NodeProto): reshape node for Q
-
- Returns:
- Tuple[int, int]: num_heads and hidden_size
- """
-
- # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size]
- q_shape = self.model.get_initializer(reshape_q.input[1])
- if q_shape is None:
- logger.debug(f"{reshape_q.input[1]} is not initializer.")
- return [0, 0]
-
- q_shape_value = NumpyHelper.to_array(q_shape)
- if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
- logger.debug(
- f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]."
- )
- return [0, 0]
-
- num_heads = q_shape_value[2]
- head_size = q_shape_value[3]
- hidden_size = num_heads * head_size
-
- return num_heads, hidden_size
-
- def create_decoder_attention_node(
- self, inputs: str, outputs: str, type_mask: int, has_mask: int, scale: float
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- input (str): input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
-
- attention_node_name = self.model.create_node_name("decoder_Attention")
- attention_node = helper.make_node(
- "CustomQkvCrossToContext_IxRT",
- inputs=inputs,
- outputs=outputs,
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("scale", scale)])
- attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend([helper.make_attribute("type_mask", type_mask)])
-
- return attention_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- """
- path1:
-
- (query) --------------MatMul---Div --> add -->softmax --->MatMul--->
- / / /
- (key) ---->Transpose > / /
- / /
- (mask) ------------------------> /
- /
- (value)--------------------------------------------->
- """
-
-
-
-
- start_node = node
- qkv_paths = {
- "path1": (
- ["Add", "Div", "MatMul", "Transpose"],
- [None, 0, None, 1],
- ), # float mask self attention, self attention key pass
- }
-
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
-
- if qkv_nodes is None:
- logger.debug("fuse_attention: failed to match qkv path")
- return
- next_nodes = self.model.get_children(node)
-
- if len(next_nodes) == 0:
- return
-
- if next_nodes[0].op_type != "MatMul":
- return
-
- second_matmul_node = next_nodes[0]
- attention_inputs = None
- attention_outputs = second_matmul_node.output
- remove_nodes = [second_matmul_node, node]
-
- (add_node, div_node, first_matmul_node, transpose_node) = qkv_nodes
- transpose_nodes = self.model.get_parents(first_matmul_node)
- q_input = transpose_nodes[0].output[0]
-
- k_transpose_node = transpose_nodes[1]
- k_transpose_node_perm = k_transpose_node.attribute[0].ints
-
- if k_transpose_node_perm == [0, 2, 3, 1]: # transposes have been merged: [0,2,1,3] -> [0, 1, 3, 2] = [0, 2, 3, 1]
- k_input = transpose_nodes[1].output[0]
-
- transpose_nodes[1].attribute[0].ints[0] = 0
- transpose_nodes[1].attribute[0].ints[1] = 2
- transpose_nodes[1].attribute[0].ints[2] = 1
- transpose_nodes[1].attribute[0].ints[3] = 3
-
- remove_nodes.extend([add_node, div_node, first_matmul_node])
-
- elif k_transpose_node_perm == [0, 1, 3, 2]:
- k_input = transpose_nodes[1].input[0]
- remove_nodes.extend([add_node, div_node, first_matmul_node,k_transpose_node])
-
- else:
- return
-
- v_input = second_matmul_node.input[1]
- attention_inputs = [q_input, k_input, v_input]
-
- has_mask = 1
- type_mask = 3 # float mask
-
- mask_input = add_node.input[0]
- score_out = div_node.output[0]
- if add_node.input[0] == score_out:
- mask_input = add_node.input[1]
- attention_inputs.append(mask_input)
-
- scale_data = self.model.get_initializer_input_edges(div_node.name, return_np_array = True)
- scale = 1.0 / scale_data[0]
-
- atten_node = self.create_decoder_attention_node(
- attention_inputs, attention_outputs, type_mask, has_mask, scale
- )
-
- self.nodes_to_add.append(atten_node)
- self.node_name_to_graph_name[atten_node.name] = self.this_graph_name
- self.nodes_to_remove.extend(remove_nodes)
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV.py
deleted file mode 100755
index d1a1baffd56aba589caa4251d7d841e9715b8f02..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Tuple, Union
-
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionSplitQKV(Fusion):
- """
- Fuse the QKV Split + Reshape + Transpose subgraph into a SplitQKV_IxRT node
- """
-
- def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
- super().__init__(model, "SplitQKV_IxRT", "Split")
-
- self.hidden_size = hidden_size
- self.num_heads = num_heads
-
- def create_node(
- self, inputs: list, outputs:list
- ) -> Union[NodeProto, None]:
- """Create an create node.
-
- Args:
- data_input (str): data input name
- mask_input (str): max input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- node_name = self.model.create_node_name("SplitQKV_IxRT")
-
-
- k_cache_output = outputs[1]
- v_cache_output = outputs[2]
-
- concat_k_input = k_cache_output + "_k_concat_input"
- concat_v_input = v_cache_output + "_v_concat_input"
-
- plugin_outputs = [outputs[0],concat_k_input,concat_v_input]
-
- new_node = helper.make_node(
- "SplitQKV_IxRT",
- inputs=inputs,
- outputs=plugin_outputs,
- name=node_name,
- )
- new_node.domain = "com.iluvatar"
- new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- new_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- new_node.attribute.extend(
- [helper.make_attribute("atten_scale", 1.0)]
- )
- new_node.attribute.extend(
- [helper.make_attribute("transpose", 1)]
- )
- new_node.attribute.extend([helper.make_attribute("num_head", self.num_heads)])
- new_node.attribute.extend(
- [helper.make_attribute("head_dim", self.hidden_size // self.num_heads)]
- )
-
-
-
- k_concat_node_name = node_name + "_k_concat"
- v_concat_node_name = node_name + "_v_concat"
-
- k_concat_node = helper.make_node(
- "Identity",
- inputs=[concat_k_input],
- outputs=[outputs[1]],
- name=k_concat_node_name,
- )
-
- v_concat_node = helper.make_node(
- "Identity",
- inputs=[concat_v_input],
- outputs=[outputs[2]],
- name=v_concat_node_name,
- )
-
- self.model.replace_input_of_all_nodes(outputs[1],concat_k_input)
- self.model.replace_input_of_all_nodes(outputs[2],concat_v_input)
- return new_node,k_concat_node,v_concat_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- split_node = node
- split_data = self.model.get_initializer_input_edges(node.name,return_np_array = True)
- if split_data[0].shape != (3,):
- return
- if split_data[0][0] != split_data[0][1] and split_data[0][1] != split_data[0][2]:
- return
-
- q_input, k_input, v_input = node.output[0],node.output[1],node.output[2]
-
- q_path_nodes= []
- k_path_nodes= []
- v_path_nodes= []
-
- reshape_nodes = self.model.get_children(node)
-
- for node in reshape_nodes:
- if node.op_type != "Reshape":
- return
- q_reshape_node,k_reshape_node,v_reshape_node = reshape_nodes[0],reshape_nodes[1],reshape_nodes[2]
-
- q_path_nodes.append(q_reshape_node)
- k_path_nodes.append(k_reshape_node)
- v_path_nodes.append(v_reshape_node)
-
- q_transpose_nodes = self.model.get_children(q_reshape_node)
- k_transpose_nodes = self.model.get_children(k_reshape_node)
- v_transpose_nodes = self.model.get_children(v_reshape_node)
-
- if len(q_transpose_nodes)!=1 and (not k_transpose_nodes) and len(v_transpose_nodes) != 1:
- return
-
-
- if (q_transpose_nodes[0].attribute[0].ints != [0, 2, 1, 3]) and (v_transpose_nodes[0].attribute[0].ints !=[0, 2, 1, 3]):
- return
-
- if len(k_transpose_nodes) == 2:
- if (k_transpose_nodes[0].attribute[0].ints != k_transpose_nodes[1].attribute[0].ints) and (k_transpose_nodes[0].attribute[0].ints !=[0, 2, 1, 3]):
- return
-
-
- if len(k_transpose_nodes) == 1:
- if (k_transpose_nodes[0].attribute[0].ints !=[0, 2, 1, 3]):
- return
-
-
- q_transpose_node = q_transpose_nodes[0]
- k_transpose_node_0 = k_transpose_nodes[0]
- v_transpose_node = v_transpose_nodes[0]
-
- k_output = k_transpose_node_0.output[0]
-
- if len(k_transpose_nodes) == 2:
- k_transpose_node_1 = k_transpose_nodes[1]
- next_node = self.model.get_children(k_transpose_node_1)
- if not next_node:
- return
-
- self.model.replace_node_input(next_node[0], k_transpose_node_1.output[0], k_transpose_node_0.output[0])
-
-
- q_path_nodes.append(q_transpose_node)
- v_path_nodes.append(v_transpose_node)
- k_path_nodes.extend(k_transpose_nodes)
-
- plugin_inputs = [split_node.input[0]]
- plugin_outputs = [q_transpose_node.output[0], k_output,v_transpose_node.output[0]]
-
- remove_nodes = [split_node]
-
- remove_nodes.extend(q_path_nodes)
- remove_nodes.extend(k_path_nodes)
- remove_nodes.extend(v_path_nodes)
-
- new_node,k_cache_concat_node, v_cache_concat_node = self.create_node(plugin_inputs, plugin_outputs)
-
- self.nodes_to_add.append(new_node)
- self.nodes_to_add.append(k_cache_concat_node)
- self.nodes_to_add.append(v_cache_concat_node)
-
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
- self.node_name_to_graph_name[k_cache_concat_node.name] = self.this_graph_name
- self.node_name_to_graph_name[v_cache_concat_node.name] = self.this_graph_name
- self.nodes_to_remove.extend(remove_nodes)
-
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV_update_KVcache.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV_update_KVcache.py
deleted file mode 100644
index 6b1599d4b27cf32c74dc9c294564490ff1e799da..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV_update_KVcache.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Tuple, Union
-
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionCosyVoiceSplitQKVUpdateKVCache(Fusion):
- """
- Fuse the QKV Split + Reshape + Transpose + KV-cache Concat subgraph into a SplitQKVUpdateKVCache_IxRT node
- """
-
- def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
- super().__init__(
- model, "SplitQKVUpdateKVCache_IxRT", "Split"
- )
-
- self.hidden_size = hidden_size
- self.num_heads = num_heads
-
- def create_node(
- self,
- inputs: list,
- outputs: list,
- ) -> Union[NodeProto, None]:
- """Create an XSoftmax node.
-
- Args:
- data_input (str): data input name
- mask_input (str): max input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- node_name = self.model.create_node_name("SplitQKVUpdateKVCache_IxRT")
-
- k_cache_output = outputs[1]
- v_cache_output = outputs[2]
-
- concat_k_input = k_cache_output + "_k_concat_input"
- concat_v_input = v_cache_output + "_v_concat_input"
-
- plugin_outputs = [outputs[0],concat_k_input,concat_v_input]
-
- new_node = helper.make_node(
- "SplitQKVUpdateKVCache_IxRT",
- inputs=inputs,
- outputs=plugin_outputs,
- name=node_name,
- )
-
- k_concat_node_name = node_name + "_k_concat"
- v_concat_node_name = node_name + "_v_concat"
-
- k_concat_node = helper.make_node(
- "Identity",
- inputs=[concat_k_input],
- outputs=[outputs[1]],
- name=k_concat_node_name,
- )
-
-
-
- v_concat_node = helper.make_node(
- "Identity",
- inputs=[concat_v_input],
- outputs=[outputs[2]],
- name=v_concat_node_name,
- )
-
-
-
-
-
-
- new_node.domain = "com.iluvatar"
- new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- new_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- new_node.attribute.extend([helper.make_attribute("num_head", self.num_heads)])
- new_node.attribute.extend(
- [helper.make_attribute("head_dim", self.hidden_size // self.num_heads)]
- )
-
- self.model.replace_input_of_all_nodes(outputs[1],concat_k_input)
- self.model.replace_input_of_all_nodes(outputs[2],concat_v_input)
-
- return new_node,k_concat_node,v_concat_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- split_node = node
- split_data = self.model.get_initializer_input_edges(node.name,return_np_array = True)
- if split_data[0].shape != (3,):
- return
- if split_data[0][0] != split_data[0][1] and split_data[0][1] != split_data[0][2]:
- return
-
- q_input, k_input, v_input = node.output[0],node.output[1],node.output[2]
-
- q_path_nodes= []
- k_path_nodes= []
- v_path_nodes= []
-
- reshape_nodes = self.model.get_children(node)
-
- for node in reshape_nodes:
- if node.op_type != "Reshape":
- return
- q_reshape_node,k_reshape_node,v_reshape_node = reshape_nodes[0],reshape_nodes[1],reshape_nodes[2]
-
- q_path_nodes.append(q_reshape_node)
- k_path_nodes.append(k_reshape_node)
- v_path_nodes.append(v_reshape_node)
-
- q_transpose_nodes = self.model.get_children(q_reshape_node)
- k_transpose_nodes = self.model.get_children(k_reshape_node)
- v_transpose_nodes = self.model.get_children(v_reshape_node)
-
- if len(q_transpose_nodes)!=1 and len(k_transpose_nodes) != 1 and len(v_transpose_nodes) != 1:
- return
-
-
- q_transpose_node = q_transpose_nodes[0]
-
- k_transpose_node = k_transpose_nodes[0]
- v_transpose_node = v_transpose_nodes[0]
-
- k_path_nodes.append(k_transpose_node)
- v_path_nodes.append(v_transpose_node)
-
-
- k_concat_nodes = self.model.get_children(k_transpose_node)
- v_concat_nodes = self.model.get_children(v_transpose_node)
-
- if len(k_transpose_nodes) != 1 or len(v_transpose_nodes) != 1:
- return
-
- k_concat_node = k_concat_nodes[0]
- v_concat_node = v_concat_nodes[0]
-
- if v_concat_node.attribute[0].i != 2 and k_concat_node.attribute[0].i != 2: #axis = 2
- return
-
- k_path_nodes.append(k_concat_node)
- v_path_nodes.append(v_concat_node)
-
- k_cache_input = k_concat_node.input[0]
- if k_transpose_node.output[0] == k_concat_node.input[0]:
- k_cache_input = k_concat_node.input[1]
- k_cache_output = k_concat_node.output[0]
-
-
-
- v_cache_input = v_concat_node.input[0]
- if v_transpose_node.output[0] == v_concat_node.input[0]:
- v_cache_input = v_concat_node.input[1]
- v_cache_output = v_concat_node.output[0]
-
-
- plugin_inputs = [split_node.input[0],k_cache_input,v_cache_input]
- plugin_outputs = [q_transpose_node.output[0], k_cache_output,v_cache_output]
- remove_nodes = [split_node, q_reshape_node,q_transpose_node]
-
- remove_nodes.extend(k_path_nodes)
- remove_nodes.extend(v_path_nodes)
- new_node,k_cache_concat_node, v_cache_concat_node= self.create_node(plugin_inputs, plugin_outputs)
-
- self.nodes_to_add.append(new_node)
- self.nodes_to_add.append(k_cache_concat_node)
- self.nodes_to_add.append(v_cache_concat_node)
-
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
- self.node_name_to_graph_name[k_cache_concat_node.name] = self.this_graph_name
- self.node_name_to_graph_name[v_cache_concat_node.name] = self.this_graph_name
-
- self.nodes_to_remove.extend(remove_nodes)
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py
deleted file mode 100644
index c2dd243357fac20057d67551c0d3d9d86b15dc68..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py
+++ /dev/null
@@ -1,389 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionCustomFCGPT2(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Reshape"], "gpt2")
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- nodes = self.model.match_parent_path(node, ["Gemm", "Reshape"], [0, 0])
-
- if nodes is None:
- return False
-
- (matmul, reshape_before_matmul) = nodes
-
- matmul_weight = self.model.get_initializer(matmul.input[1])
- matmul_bias = self.model.get_initializer(matmul.input[2])
-
- if matmul_weight is None or matmul_bias is None:
- return False
-
- w = NumpyHelper.to_array(matmul_weight)
- b = NumpyHelper.to_array(matmul_bias)
-
- transB = 0
- for attr in matmul.attribute:
- if attr.name == "transB":
- transB = attr.i
- break
-
- trans_matmul_weight = w
- if transB == 0:
- trans_matmul_weight = w.transpose(1, 0)
- if matmul_weight.name not in self.model.initializer_visited.keys():
- self.model.initializer_visited[matmul_weight.name] = True
- if matmul_weight.data_type == 10:
- matmul_weight.CopyFrom(
- numpy_helper.from_array(
- trans_matmul_weight.astype(np.float16), matmul_weight.name
- )
- )
- else:
- matmul_weight.CopyFrom(
- numpy_helper.from_array(trans_matmul_weight, matmul_weight.name)
- )
-
- if matmul_bias.data_type == 10:
- matmul_bias.CopyFrom(
- numpy_helper.from_array(b.astype(np.float16), matmul_bias.name)
- )
- else:
- matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name))
-
- fused_node = helper.make_node(
- "CustomFCPluginDynamic_IxRT",
- inputs=[reshape_before_matmul.input[0]],
- outputs=node.output,
- name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
- )
- fused_node.domain = "com.iluvatar"
- fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])])
- fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
- fused_node.attribute.extend([helper.make_attribute("W", matmul_weight)])
- fused_node.attribute.extend([helper.make_attribute("B", matmul_bias)])
- fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- self.nodes_to_add.append(fused_node)
- self.nodes_to_remove.extend([matmul, node, reshape_before_matmul])
-
-
-class FusionCustomFcRoformer(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"], "roformer fc")
-
- # For model Roformer.
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- if len(node.input) != 2:
- return False
-
- fc_paths = {
- "path1": (["Reshape", "MatMul", "Reshape"], [0, 0, 0]),
- "path2": (["Reshape", "MatMul", "Reshape"], [1, 0, 0]),
- }
-
- nodes, paths = self.match_parent_path_from_dict(node, fc_paths)
- if nodes is None:
- return False
-
- reshape_after_matmul = nodes[0]
- matmul = nodes[1]
- reshape_before_matmul = nodes[2]
-
- reshape_before_shape = None
- reshape_after_shape = None
- for value_info in self.model.graph().value_info:
- if value_info.name == reshape_before_matmul.input[0]:
- reshape_before_shape = len(value_info.type.tensor_type.shape.dim)
- break
- for value_info in self.model.graph().value_info:
- if value_info.name == reshape_after_matmul.output[0]:
- reshape_after_shape = len(value_info.type.tensor_type.shape.dim)
- break
- if reshape_before_shape != reshape_after_shape:
- return False
-
- weight = self.model.get_initializer(matmul.input[1])
- bias = self.model.get_initializer(node.input[1]) or self.model.get_initializer(
- node.input[0]
- )
-
- if weight is None or bias is None:
- return False
-
- w = NumpyHelper.to_array(weight)
- w_in_size = w.shape[0]
- weight_dim = np.prod(w.shape[1:])
-
- b = NumpyHelper.to_array(bias)
- bias_dim = np.prod(b.shape)
- trans_matmul_weight = w.transpose(1, 0)
- weight.CopyFrom(onnx.numpy_helper.from_array(trans_matmul_weight, weight.name))
- # Sometimes weights and bias are stored in fp16
- if weight.data_type == 10:
- weight.CopyFrom(
- numpy_helper.from_array(
- trans_matmul_weight.astype(np.float16), weight.name
- )
- )
- bias_arr = onnx.numpy_helper.to_array(bias).flatten()
- bias.CopyFrom(onnx.numpy_helper.from_array(bias_arr, bias.name))
- if bias.data_type == 10:
- bias.CopyFrom(
- numpy_helper.from_array(
- NumpyHelper.to_array(bias).astype(np.float16), bias.name
- )
- )
-
- fused_node = helper.make_node(
- "CustomFCPluginDynamic_IxRT",
- inputs=[reshape_before_matmul.input[0]],
- outputs=node.output,
- name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
- )
- fused_node.domain = "com.iluvatar"
- fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])])
- fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
- fused_node.attribute.extend([helper.make_attribute("W", weight)])
- fused_node.attribute.extend([helper.make_attribute("B", bias)])
- fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- self.nodes_to_add.append(fused_node)
-
- self.nodes_to_remove.extend([node])
- self.nodes_to_remove.extend(nodes)
- return True
-
-
-class FusionCustomFC(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"])
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- if self.fuse_1(node, input_name_to_nodes, output_name_to_node):
- return
-
- def fuse_1(self, node, input_name_to_nodes, output_name_to_node):
- if len(node.input) != 2:
- return False
- nodes = self.model.match_parent_path(node, ["MatMul"], [None])
-
- if nodes is None:
- return False
- matmul = nodes[0]
-
- matmul_weight = self.model.get_initializer(matmul.input[1])
- matmul_bias = self.model.get_initializer(
- node.input[1]
- ) or self.model.get_initializer(node.input[0])
-
- if matmul_weight is None or matmul_bias is None:
- return False
-
- w = NumpyHelper.to_array(matmul_weight)
- b = NumpyHelper.to_array(matmul_bias)
-
- trans_matmul_weight = w.transpose(1, 0)
- if matmul_weight.name not in self.model.initializer_visited.keys():
- self.model.initializer_visited[matmul_weight.name] = True
- if matmul_weight.data_type == 10:
- matmul_weight.CopyFrom(
- numpy_helper.from_array(
- trans_matmul_weight.astype(np.float16), matmul_weight.name
- )
- )
- else:
- matmul_weight.CopyFrom(
- numpy_helper.from_array(trans_matmul_weight, matmul_weight.name)
- )
-
- if matmul_bias.data_type == 10:
- matmul_bias.CopyFrom(
- numpy_helper.from_array(b.astype(np.float16), matmul_bias.name)
- )
- else:
- matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name))
-
- fused_node = helper.make_node(
- "CustomFCPluginDynamic_IxRT",
- inputs=[matmul.input[0]],
- outputs=node.output,
- name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
- )
- fused_node.domain = "com.iluvatar"
- fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])])
- fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
- fused_node.attribute.extend([helper.make_attribute("W", matmul_weight)])
- fused_node.attribute.extend([helper.make_attribute("B", matmul_bias)])
- fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- self.nodes_to_add.append(fused_node)
- self.nodes_to_remove.extend([matmul, node])
- return True
-
-
-class FusionCustomFCActivation(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(
- model,
- "CustomFCPluginDynamic_IxRT",
- ["Gelu", "Relu", "CustomGeluPluginDynamic_IxRT", "Mul"],
- "with activation",
- )
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- if node.op_type == "Mul":
- return_indice = []
- nodes = self.model.match_parent_path(
- node,
- ["Sigmoid", "Mul", "CustomFCPluginDynamic_IxRT"],
- [None, 0, 0],
- return_indice=return_indice,
- )
- if nodes is None:
- return
-
- (sigmoid_node, mul_node, custom_fc_node) = nodes
- if output_name_to_node[node.input[1 - return_indice[0]]] != custom_fc_node:
- return
-
- activation_type = 20
- for attr in custom_fc_node.attribute:
- if attr.name == "act_type":
- attr.i = activation_type
- break
-
- custom_fc_node.output[0] = node.output[0]
- self.nodes_to_add.append(custom_fc_node)
- self.nodes_to_remove.extend([node, sigmoid_node, mul_node, custom_fc_node])
- self.node_name_to_graph_name[custom_fc_node.name] = self.this_graph_name
- else:
- nodes = self.model.match_parent_path(
- node, ["CustomFCPluginDynamic_IxRT"], [0]
- )
-
- if nodes is None:
- logger.debug("CustomFCActivation: failed to match fc+gelu/relu path")
- return
-
- fc_node = nodes[0]
- activation_type = 3
- if node.op_type == "Gelu":
- activation_type = 3
- if node.op_type == "Relu":
- activation_type = 4
-
- for attr in fc_node.attribute:
- if attr.name == "act_type":
- attr.i = activation_type
- break
-
- fc_node.output[0] = node.output[0]
- self.nodes_to_add.append(fc_node)
- self.nodes_to_remove.extend([node, fc_node])
- self.node_name_to_graph_name[fc_node.name] = self.this_graph_name
-
-
-class FusionConformerCustomFCActivation(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(
- model,
- "CustomFCPluginDynamic_IxRT",
- ["Mul"],
- "with activation",
- )
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- # return_indice = []
- nodes = self.model.match_parent_path(
- node,
- ["Sigmoid", "CustomFCPluginDynamic_IxRT"],
- [
- None,
- 0,
- ],
- # return_indice=return_indice,
- )
- if nodes is None:
- return
- (sigmoid_node, custom_fc_node) = nodes
- # if output_name_to_node[node.input[1 - return_indice[0]]] != custom_fc_node:
- # return
- activation_type = 20
- for attr in custom_fc_node.attribute:
- if attr.name == "act_type":
- attr.i = activation_type
- break
- custom_fc_node.attribute.extend([helper.make_attribute("swish_alpha", 1.0)])
- custom_fc_node.output[0] = node.output[0]
- self.nodes_to_add.append(custom_fc_node)
- self.nodes_to_remove.extend([node, sigmoid_node, custom_fc_node])
- self.node_name_to_graph_name[custom_fc_node.name] = self.this_graph_name
-
-
-class FusionTorchvisionVitCustomFC(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "CustomFCPluginDynamic_IxRT", ["CustomQKVToContextPluginDynamic_IxRT"], "torchvision vit custom_fc",)
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- custom_fc_node_0 = self.model.get_children(node, input_name_to_nodes)
- transpose_node_0 = self.model.get_children(custom_fc_node_0[0], input_name_to_nodes)
-
- if transpose_node_0[0].op_type != "Transpose":
- return
-
- custom_fc_node_0[0].output[0] = transpose_node_0[0].output[0]
-
- nodes = self.model.match_parent_path(node, ["CustomFCPluginDynamic_IxRT","Transpose"], [0, 0])
- if nodes is None:
- return
-
- (custom_fc_node_1, transpose_node_1) = nodes
- custom_fc_node_1.input[0] = transpose_node_1.input[0]
-
- self.nodes_to_add.append(custom_fc_node_1)
- self.nodes_to_add.append(custom_fc_node_0[0])
- self.nodes_to_remove.extend([transpose_node_1, custom_fc_node_1, transpose_node_0[0], custom_fc_node_0[0]])
- self.node_name_to_graph_name[custom_fc_node_1.name] = self.this_graph_name
- self.node_name_to_graph_name[custom_fc_node_0[0].name] = self.this_graph_name
-
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py
deleted file mode 100644
index 670a767e18e3ccd13d5540c9a415aa3ad8fc7525..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Tuple, Union
-
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionDisentangledAttention(Fusion):
- """
- Match Disentangled Attention
- -------------------------------------------
- |
- GatherElements --> Add --> Add -->
- |
- GatherElements --> Transpose ->
- """
-
- def __init__(self, model: OnnxModel):
- super().__init__(model, "DisentangledAttention_IxRT", "Add")
-
- def create_disentangled_attention_node(
- self,
- inputs: List[str],
- outputs: List[str],
- ) -> Union[NodeProto, None]:
- """Create an disentangled attention node.
-
- Args:
- inputs List[str]: data input names
- outputs List[str]: data output names
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- disentangled_attention_node_name = self.model.create_node_name(
- "DisentangledAttention"
- )
-
- disentangled_attention_node = helper.make_node(
- "DisentangledAttention_IxRT",
- inputs=inputs,
- outputs=outputs,
- name=disentangled_attention_node_name,
- )
- disentangled_attention_node.domain = "com.iluvatar"
- disentangled_attention_node.attribute.extend(
- [helper.make_attribute("plugin_namespace", "")]
- )
- disentangled_attention_node.attribute.extend(
- [helper.make_attribute("plugin_version", "1")]
- )
- disentangled_attention_node.attribute.extend(
- [helper.make_attribute("factor", 0.1)]
- )
- disentangled_attention_node.attribute.extend(
- [helper.make_attribute("span", 512)]
- )
-
- return disentangled_attention_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- disentangled_attention_path1 = {
- "path": (["Add", "GatherElements", "MatMul"], [None, None, None]),
- }
-
- disentangled_attention_path2 = {
- "path": (
- ["Add", "Transpose", "GatherElements", "MatMul"],
- [None, None, None, None],
- ),
- }
-
- nodes1, _ = self.match_parent_path_from_dict(node, disentangled_attention_path1)
- nodes2, _ = self.match_parent_path_from_dict(node, disentangled_attention_path2)
-
- if nodes1 is not None and nodes2 is not None:
- if nodes1[0] == nodes2[0]:
- (head_add, first_gather, first_matmul) = nodes1
- (_, transpose, second_gather, second_matmul) = nodes2
- tail_add = node
-
- first_input = [i for i in tail_add.input if i != head_add.output[0]][0]
- second_input = first_matmul.output[0]
- third_input = second_matmul.output[0]
- output = tail_add.output[0]
-
- disentangled_attention_node = self.create_disentangled_attention_node(
- [first_input, second_input, third_input], [output]
- )
- self.nodes_to_add.append(disentangled_attention_node)
- self.node_name_to_graph_name[
- disentangled_attention_node.name
- ] = self.this_graph_name
- self.nodes_to_remove.append(tail_add)
- self.nodes_to_remove.append(head_add)
- self.nodes_to_remove.append(first_gather)
- self.nodes_to_remove.append(transpose)
- self.nodes_to_remove.append(second_gather)
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py
deleted file mode 100644
index f46fa2c77da83612a25dd7bde215f20e70845ff7..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py
+++ /dev/null
@@ -1,1078 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Dict, List, Tuple, Union
-
-from onnx import NodeProto, TensorProto, helper
-
-from .fusion_base import Fusion
-from .fusion_utils import FusionUtils
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionEmbedLayerNoMask(Fusion):
- """
- Fuse embedding layer into one node (EmbedLayerNormalization).
- It supports the following model types: BERT, DistilBert, ALBert.
- """
-
- def __init__(self, model: OnnxModel, description: str = "no mask"):
- super().__init__(
- model,
- "EmbedLayerNormalization",
- ["LayerNormalization", "SkipLayerNormalization"],
- description,
- )
- self.utils = FusionUtils(model)
- self.shape_infer_helper = self.model.infer_runtime_shape({}, update=True)
- # The following will be reset in each fuse call of FusionEmbedLayerNormalization
- self.attention = None
- self.embed_node = None
-
- def match_two_gather(
- self, add: NodeProto
- ) -> Union[None, Tuple[NodeProto, NodeProto]]:
- gather_0_path = self.model.match_parent_path(add, ["Gather"], [0])
- if gather_0_path is None:
- return None
-
- gather_1_path = self.model.match_parent_path(add, ["Gather"], [1])
- if gather_1_path is None:
- return None
-
- return gather_0_path[0], gather_1_path[0]
-
- def check_attention_subgraph(
- self,
- layernorm: NodeProto,
- input_name_to_nodes: Dict[str, List[NodeProto]],
- is_distil_bert: bool,
- ) -> bool:
- """Check that LayerNormalization has a child of Attention node or subgraph like Attention.
-
- Args:
- layernorm (NodeProto): LayerNormalization node
- input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
- is_distil_bert (bool): whether it is DistilBert or not
-
- Returns:
- bool: whether there is an Attention node or an Attention-like subgraph
- """
- self.attention = self.model.find_first_child_by_type(
- layernorm, "Attention", input_name_to_nodes, recursive=False
- )
- if self.attention is None:
- # In case user disables attention fusion, check whether subgraph looks like Attention.
- if layernorm.output[0] not in input_name_to_nodes:
- return False
- children = input_name_to_nodes[layernorm.output[0]]
-
- # For Albert, there is MatMul+Add after embedding layer before attention.
- if (
- len(children) == 1
- and children[0].op_type == "MatMul"
- and children[0].output[0] in input_name_to_nodes
- ):
- grandchildren = input_name_to_nodes[children[0].output[0]]
- if (
- len(grandchildren) == 1
- and grandchildren[0].op_type == "Add"
- and grandchildren[0].output[0] in input_name_to_nodes
- ):
- nodes = input_name_to_nodes[grandchildren[0].output[0]]
- for node in nodes:
- if node.op_type == "Attention":
- self.attention = node
- return True
- children_types = sorted([child.op_type for child in nodes])
- else:
- children_types = sorted([child.op_type for child in children])
-
- # Two Shape nodes might be merged by ORT
- if is_distil_bert:
- # SkipLayerNormalization might exist when the model has been optimized by ORT first.
- if (
- children_types
- != ["MatMul", "MatMul", "MatMul", "Shape", "SkipLayerNormalization"]
- and children_types
- != ["Add", "MatMul", "MatMul", "MatMul", "Shape", "Shape"]
- and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape"]
- ):
- logger.debug(
- "No Attention like subgraph in children of LayerNormalization"
- )
- return False
- else:
- if children_types != [
- "Add",
- "MatMul",
- "MatMul",
- "MatMul",
- ] and children_types != [
- "MatMul",
- "MatMul",
- "MatMul",
- "SkipLayerNormalization",
- ]:
- logger.debug(
- "No Attention like subgraph in children of LayerNormalization"
- )
- return False
- return True
-
- def match_position_embedding_distilbert(
- self, position_embedding_gather, input_ids, output_name_to_node
- ):
- """ Match position embedding path from input_ids to Gather for DistilBert.
-
- Pattern is like the following:
- (input_ids)
- |
- Shape
- | \
- | Gather (indices=1)
- | |
- | Cast (optional)
- | |
- | Range (start=0, end=*, delta=1)
- | |
- | Unsqueeze
- | /
- Expand
- |
- Gather
- """
- # remove after tests pass
- path1 = self.model.match_parent_path(
- position_embedding_gather, ["Expand", "Shape"], [1, 1]
- )
- if path1 is None:
- path1 = self.model.match_parent_path(
- position_embedding_gather,
- ["Expand", "Where", "Reshape", "Shape"],
- [1, 1, 2, 0],
- )
- if path1 is None:
- return False
-
- expand, shape = path1[0], path1[-1]
- if shape.input[0] != input_ids:
- return False
-
- _, path2, _ = self.model.match_parent_paths(
- expand,
- [
- (["Unsqueeze", "Range", "Cast", "Gather", "Shape"], [0, 0, 1, 0, 0]),
- (["Unsqueeze", "Range", "Gather", "Shape"], [0, 0, 1, 0]),
- ],
- output_name_to_node,
- )
- if path2 is None:
- return False
-
- range_node = path2[1]
- if not (
- self.utils.check_node_input_value(range_node, 0, 0)
- and self.utils.check_node_input_value(range_node, 2, 1)
- ):
- return False
-
- gather_node = path2[-2]
- if not (self.utils.check_node_input_value(gather_node, 1, 1)):
- return False
-
- shape_node = path2[-1]
- if shape_node.input[0] != input_ids:
- return False
-
- return True
-
- def match_position_embedding_roberta(
- self, position_embedding_gather, input_ids, output_name_to_node
- ):
- """Match position embedding path from input_ids to Gather for Roberta.
-
- Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
- (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather
- | ^
- V |
- +------------------------------+
-
- Roberta new pattern from transformers v4.9:
- (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather
- | ^
- V |
- +-------------------------------------------+
-
- start_node = position_embedding_gather
- start_index = 1
-
- # match optional Cast node.
- parent = self.model.get_parent(start_node, start_index, output_name_to_node)
- if parent is None:
- return
- if parent.op_type == "Cast":
- if OnnxModel.get_node_attribute(parent, "to") != 7:
- return
- start_node = parent
- start_index = 0
-
- i, path, return_indices = self.model.match_parent_paths(
- start_node,
- [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]),
- (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])],
- output_name_to_node)
-
- if path is not None:
- # constant input of Add shall be 1.
- i, value = self.model.get_constant_input(path[0])
- if value != 1:
- return False
-
- _, self.padding_word_id = self.model.get_constant_input(path[-1])
-
- return input_ids == path[-1].input[0]
- """
-
- return False
-
- def match_position_embedding_bert(
- self, position_embedding_gather, input_ids, output_name_to_node
- ):
- """ Match position embedding path from input_ids to Gather for BERT.
-
- BERT Embedding Layer Pattern:
- (input_ids)
- / \
- / Shape
- / |
- / Gather (indices=1)
- / |
- / Add (optional, B=0)
- / |
- Gather (segment_ids) Unsqueeze (axes=0)
- \ | |
- \ Gather Slice (data[1,512], starts=0, ends=*, axes=1, steps=1)
- \ / |
- Add Gather
- \ /
- Add
- |
- LayerNormalization
- """
- path = self.model.match_parent_path(
- position_embedding_gather,
- ["Slice", "Unsqueeze"],
- [1, 2],
- output_name_to_node,
- )
- if path is None:
- return False
-
- slice, unsqueeze = path
- slice_weight = self.model.get_constant_value(slice.input[0])
- if not (
- slice_weight is not None
- and len(slice_weight.shape) == 2
- and slice_weight.shape[0] == 1
- and self.utils.check_node_input_value(slice, 1, [0])
- and self.utils.check_node_input_value(slice, 3, [1])
- and (
- len(slice.input) == 4
- or self.utils.check_node_input_value(slice, 4, [1])
- )
- ):
- return False
-
- opset_version = self.model.get_opset_version()
- if opset_version < 13:
- if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]):
- return False
- else:
- if not self.utils.check_node_input_value(unsqueeze, 1, [0]):
- return False
-
- node = self.model.get_parent(unsqueeze, 0, output_name_to_node)
- if node is None:
- return False
- if node.op_type == "Add":
- if not self.utils.check_node_input_value(node, 1, 0):
- return False
- gather = self.model.get_parent(node, 0, output_name_to_node)
- else:
- gather = node
-
- if gather is None or gather.op_type != "Gather":
- return False
- if not (self.utils.check_node_input_value(gather, 1, 1)):
- return False
-
- shape = self.model.get_parent(gather, 0, output_name_to_node)
- if shape is None or shape.op_type != "Shape":
- return False
-
- return input_ids == shape.input[0]
-
- def match_position_embedding(
- self, position_embedding_gather, input_ids, output_name_to_node
- ):
- if self.match_position_embedding_bert(
- position_embedding_gather, input_ids, output_name_to_node
- ):
- return True
-
- # TODO: Support roberta (position starts from 2 instead of 0) in EmbedLayerNormalization kernel
- # related: https://github.com/huggingface/transformers/issues/10736
- # if self.match_position_embedding_roberta(position_embedding_gather, input_ids, output_name_to_node):
- # return True
-
- if self.match_position_embedding_distilbert(
- position_embedding_gather, input_ids, output_name_to_node
- ):
- return True
-
- return False
-
- def check_embedding(
- self, word_embedding_gather, segment_embedding_gather, position_embedding_gather
- ):
- """Sanity check of embedding weights, and match hidden_size of weights and shape of inputs."""
- input_ids = word_embedding_gather.input[1]
- segment_ids = (
- segment_embedding_gather.input[1] if segment_embedding_gather else None
- )
- position_ids = position_embedding_gather.input[1]
-
- if self.shape_infer_helper is not None:
- input_ids_shape = self.shape_infer_helper.get_edge_shape(input_ids)
- position_ids_shape = self.shape_infer_helper.get_edge_shape(position_ids)
- assert input_ids_shape and position_ids_shape
- if not (
- len(input_ids_shape) == 2
- and len(position_ids_shape) == 2
- and input_ids_shape[1] == position_ids_shape[1]
- ):
- logger.info(
- "Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}".format(
- input_ids_shape, position_ids_shape
- )
- )
- return False
-
- if segment_ids and not self.shape_infer_helper.compare_shape(
- input_ids, segment_ids
- ):
- logger.info(
- "Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".format(
- input_ids_shape,
- self.shape_infer_helper.get_edge_shape(segment_ids),
- )
- )
- return False
-
- word_embedding_table = self.model.get_constant_value(
- word_embedding_gather.input[0]
- )
- if word_embedding_table is None or len(word_embedding_table.shape) != 2:
- logger.info(
- "Cannot fuse EmbedLayerNormalization: word embedding table is not expected"
- )
- return False
-
- position_embedding_table = self.model.get_constant_value(
- position_embedding_gather.input[0]
- )
- if (
- position_embedding_table is None
- or len(position_embedding_table.shape) != 2
- or (word_embedding_table.shape[1] != position_embedding_table.shape[1])
- ):
- logger.info(
- "Cannot fuse EmbedLayerNormalization: position embedding table is not expected"
- )
- return False
-
- if segment_ids:
- segment_embedding_table = self.model.get_constant_value(
- segment_embedding_gather.input[0]
- )
- if (
- segment_embedding_table is None
- or len(segment_embedding_table.shape) != 2
- or (word_embedding_table.shape[1] != segment_embedding_table.shape[1])
- ):
- logger.info(
- "Cannot fuse EmbedLayerNormalization: segment embedding table is not expected"
- )
- return False
-
- # In the normal case, the word embedding table is the largest, the segment embedding table is the smallest, and the position embedding table is in between.
- # TODO: use other information (like initializer names) to identify different embedding weights automatically.
- if word_embedding_table.shape[0] <= position_embedding_table.shape[0]:
- logger.warning(
- f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]}"
- )
-
- if segment_ids:
- if word_embedding_table.shape[0] <= segment_embedding_table.shape[0]:
- logger.warning(
- f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}"
- )
-
- if position_embedding_table.shape[0] <= segment_embedding_table.shape[0]:
- logger.warning(
- f"position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}"
- )
-
- return True
-
- def cast_to_int32(self, input_name: str) -> Tuple[str, Union[None, NodeProto]]:
- """Cast a graph input or node input to int32.
-
- Args:
- input_name (str): name of graph input or node input
-
- Returns:
- A tuple of casted input name and the cast node.
- int32_output (str): If the input is int32, it is the input name; otherwise it is the output name of the Cast node.
- input_cast_node (Union[None, NodeProto]): Cast node. It could be None if input is int32.
- """
- input_cast_node = None
- graph_input = self.model.find_graph_input(input_name)
- if graph_input is not None:
- if graph_input.type.tensor_type.elem_type != TensorProto.INT32:
- int32_output, input_cast_node = self.utils.cast_input_to_int32(
- input_name
- )
- else:
- int32_output = input_name
- else:
- int32_output, input_cast_node = self.utils.cast_input_to_int32(input_name)
-
- return int32_output, input_cast_node
-
- def create_fused_node(
- self,
- input_ids: str,
- layernorm: NodeProto,
- word_embedding_gather: NodeProto,
- position_embedding_gather: NodeProto,
- segment_embedding_gather: Union[None, NodeProto],
- position_ids: str = None,
- embedding_sum_output=False,
- ):
- """Create an EmbedLayerNormalization node. Note that segment embedding is optional.
-
- Args:
- input_ids (str): input_ids for word embeddings
- layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node.
- word_embedding_gather (NodeProto): the Gather node for word embedding
- position_embedding_gather (NodeProto): the Gather node for position embedding
- segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None.
-
- Returns:
- NodeProto: the EmbedLayerNormalization node created.
- """
- nodes_to_add = []
- input_ids, _ = self.cast_to_int32(input_ids)
-
- node_name = self.model.create_node_name("EmbedLayerNormalization")
-
- if layernorm.op_type == "LayerNormalization":
- gamma = layernorm.input[1]
- beta = layernorm.input[2]
- else: # SkipLayerNormalization
- gamma = layernorm.input[2]
- beta = layernorm.input[3]
-
- embed_node_inputs = None
- if segment_embedding_gather is not None:
- segment_ids, _ = self.cast_to_int32(segment_embedding_gather.input[1])
-
- embed_node_inputs = [
- input_ids,
- segment_ids,
- word_embedding_gather.input[0],
- position_embedding_gather.input[0],
- segment_embedding_gather.input[0],
- gamma,
- beta,
- ]
- else: # no segment embedding
- embed_node_inputs = [
- input_ids,
- "",
- word_embedding_gather.input[0],
- position_embedding_gather.input[0],
- "",
- gamma,
- beta,
- ]
-
- if position_ids is not None:
- # Adding an empty input for mask before position_ids
- embed_node_inputs.append("")
- position_ids, _ = self.cast_to_int32(position_ids)
- embed_node_inputs.append(position_ids)
-
- embed_node_outputs = [node_name + "_output", node_name + "_dummy_mask_index"]
- if embedding_sum_output:
- embed_node_outputs.append(node_name + "_embedding_sum")
-
- embed_node = helper.make_node(
- "EmbedLayerNormalization",
- embed_node_inputs,
- outputs=embed_node_outputs,
- name=node_name,
- )
-
- embed_node.domain = "com.microsoft"
-
- # Pass attribute "epsilon" from normalize node to EmbedLayerNormalization.
- for att in layernorm.attribute:
- if att.name == "epsilon":
- embed_node.attribute.extend([att])
-
- # Set default value to 1e-12 if no attribute is found.
- # OnnxRuntime 1.2.0 or older has no epsilon attribute. The optimized model can only work for 1.3.0 or later.
- if len(embed_node.attribute) == 0:
- embed_node.attribute.extend([helper.make_attribute("epsilon", 1.0e-12)])
-
- # Make sure new EmbedLayerNormalization node is the last one in self.nodes_to_add.
- nodes_to_add.append(embed_node)
- for node in nodes_to_add:
- self.node_name_to_graph_name[node.name] = self.this_graph_name
- self.nodes_to_add.extend(nodes_to_add)
-
- self.embed_node = embed_node
- return embed_node
-
- def finish_fusion(self, layernorm, embed_node):
- self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0])
- # use prune graph to remove nodes that are no longer needed
- self.prune_graph = True
-
- def is_embedding_sum_needed(self, add_before_layer_norm):
- """Check that Add before layer norm has an output to add before next layernorm
-
- Args:
- add_before_layer_norm (NodeProto): Add before any LayerNormalization node in topological order of graph
-
- Returns:
- bool: whether there is an extra output needed out of embed layer norm node
- """
-
- nodes = self.model.get_children(add_before_layer_norm)
-
- return len(nodes) > 1
-
- def fuse_gpt2(
- self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node
- ):
- # graph checks
- # gpt2 has no segment embedding, subgraph pattern is like
- # input_ids position_ids
- # | |
- # Gather Gather
- # \ /
- # Add _ _ _ _ _
- # | |
- # LayerNormalization |
- # | |
- # Attention |
- # | |
- # Matmul |
- # | /
- # Add /
- # \ /
- # Add
- two_gather = self.match_two_gather(add_before_layernorm)
- if two_gather is None:
- return False
-
- add_output = add_before_layernorm.output[0]
-
- word_embedding_gather, position_embedding_gather = two_gather
- input_ids = word_embedding_gather.input[1]
- position_ids = position_embedding_gather.input[1]
-
- if not self.check_attention_subgraph(
- layernorm, input_name_to_nodes, is_distil_bert=False
- ):
- return False
-
- if not self.check_embedding(
- word_embedding_gather, None, position_embedding_gather
- ):
- return False
-
- optional_embedding_sum_output = False
- if self.is_embedding_sum_needed(add_before_layernorm):
- optional_embedding_sum_output = True
-
- # make the fused node
- embed_node = self.create_fused_node(
- input_ids,
- layernorm,
- word_embedding_gather,
- position_embedding_gather,
- None,
- position_ids,
- optional_embedding_sum_output,
- )
-
- # direct the output to another add too
- self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0])
- if optional_embedding_sum_output:
- self.model.replace_input_of_all_nodes(add_output, embed_node.output[2])
-
- return True
-
- def fuse_distilbert(
- self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node
- ):
- """Fuse embedding layer for DistilBert
- Args:
- layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
- add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
- input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
- output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
- """
-
- # DistilBert has no segment embedding, subgraph pattern is like
- # input_ids
- # | \
- # | (position_embedding_subgraph)
- # | |
- # Gather Gather
- # \ /
- # Add
- # |
- # LayerNormalization
- two_gather = self.match_two_gather(add_before_layernorm)
- if two_gather is None:
- return False
-
- word_embedding_gather, position_embedding_gather = two_gather
- input_ids = word_embedding_gather.input[1]
-
- if not self.check_attention_subgraph(
- layernorm, input_name_to_nodes, is_distil_bert=True
- ):
- return False
-
- if not self.match_position_embedding(
- position_embedding_gather, input_ids, output_name_to_node
- ):
- return False
-
- if not self.check_embedding(
- word_embedding_gather, None, position_embedding_gather
- ):
- return False
-
- embed_node = self.create_fused_node(
- input_ids, layernorm, word_embedding_gather, position_embedding_gather, None
- )
- self.finish_fusion(layernorm, embed_node)
- return True
-
- def fuse_bert(
- self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node
- ):
- """Fuse embedding layer for Bert
- Args:
- layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
- add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
- input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
- output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
- """
-
- add_2_gather = self.model.match_parent_path(add_before_layernorm, ["Add"], [0])
- if add_2_gather is None:
- return False
-
- two_gather = self.match_two_gather(add_2_gather[0])
- if two_gather is None:
- return False
-
- word_embedding_gather, segment_embedding_gather = two_gather
-
- input_ids = word_embedding_gather.input[1]
-
- if not self.check_attention_subgraph(
- layernorm, input_name_to_nodes, is_distil_bert=False
- ):
- return False
-
- position_embedding_path = self.model.match_parent_path(
- add_before_layernorm, ["Gather"], [1]
- )
- if position_embedding_path is None:
- return False
-
- position_embedding_gather = position_embedding_path[0]
- if not self.match_position_embedding(
- position_embedding_gather, input_ids, output_name_to_node
- ):
- if not self.match_position_embedding(
- segment_embedding_gather, input_ids, output_name_to_node
- ):
- return False
- # position and segment are switched
- temp = segment_embedding_gather
- segment_embedding_gather = position_embedding_gather
- position_embedding_gather = temp
-
- if not self.check_embedding(
- word_embedding_gather, segment_embedding_gather, position_embedding_gather
- ):
- return False
-
- embed_node = self.create_fused_node(
- input_ids,
- layernorm,
- word_embedding_gather,
- position_embedding_gather,
- segment_embedding_gather,
- )
- self.finish_fusion(layernorm, embed_node)
- return True
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- if node.op_type == "LayerNormalization":
- first_add_path = self.model.match_parent_path(node, ["Add"], [0])
- if first_add_path is None:
- return
- add_before_layernorm = first_add_path[0]
- else: # SkipLayerNormalization
- add_before_layernorm = node # Add is fused into SkipLayerNormalization
-
- if self.fuse_gpt2(
- node, add_before_layernorm, input_name_to_nodes, output_name_to_node
- ):
- return
-
- if self.fuse_distilbert(
- node, add_before_layernorm, input_name_to_nodes, output_name_to_node
- ):
- return
-
- if self.fuse_bert(
- node, add_before_layernorm, input_name_to_nodes, output_name_to_node
- ):
- return
-
-
-class FusionEmbedLayerNormalization(FusionEmbedLayerNoMask):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "with mask")
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- # Reset attention and embed_node so that we know fusion is successful when they are not None.
- self.attention = None
- self.embed_node = None
- super().fuse(node, input_name_to_nodes, output_name_to_node)
-
- if self.attention and self.embed_node:
- mask_index = self.attention.input[3]
- if mask_index in output_name_to_node:
- node = output_name_to_node[mask_index]
- if node.op_type == "ReduceSum":
- embed_node = self.embed_node
- mask_input_name = node.input[0]
- self.nodes_to_remove.extend([node])
- embed_node.input.append(mask_input_name)
- embed_node.output[1] = mask_index
-
-
-class FusionBertEmbedLayerNormalization(Fusion):
- """
- Fuse BertEmbedLayerNormalization subgraph into one node.
- """
-
- def __init__(self, model: OnnxModel):
- super().__init__(
- model, "CustomEmbLayerNormPluginDynamic_IxRT", "CustomQKVToContextPluginDynamic_IxRT"
- )
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- input --> CustomEmbLayerNormPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT --> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT
- """
- children = self.model.get_children(node, input_name_to_nodes)
- parent = self.model.get_parents(node, output_name_to_node)
-
- if len(children) == 0:
- return
- if len(parent) == 0:
- return
-
- start_node = node
-
- # word_embeddings
- word_embeddings_node = self.model.match_parent_path(
- start_node,
- ["CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"],
- [0, 0, 0, 0, 0],
- output_name_to_node,
- )
-
- # token_type_embeddings
- token_type_embeddings_node = self.model.match_parent_path(
- start_node,
- ["CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"],
- [0, 0, 0, 0, 1],
- output_name_to_node,
- )
-
- # attention_mask
- attention_mask_node = self.model.match_parent_path(
- start_node,
- ["Mul", "Sub", "Cast", "Unsqueeze"],
- [1, 0, 1, 0],
- output_name_to_node,
- )
-
- if word_embeddings_node is None or token_type_embeddings_node is None or attention_mask_node is None:
- return
-
- if word_embeddings_node and token_type_embeddings_node and attention_mask_node:
- subgraph_nodes = []
- subgraph_nodes.extend(word_embeddings_node)
- subgraph_nodes.extend(token_type_embeddings_node)
- subgraph_nodes.extend(attention_mask_node)
-
- subgraph_nodes_unique = []
- for item in subgraph_nodes:
- if item not in subgraph_nodes_unique:
- subgraph_nodes_unique.append(item)
- subgraph_nodes_remove = []
- for item in subgraph_nodes_unique:
- if item.op_type != "CustomFCPluginDynamic_IxRT":
- subgraph_nodes_remove.append(item)
-
- # input_ids = self.model.get_graph_inputs_excluding_initializers()[0]
- # token_type_ids = self.model.get_graph_inputs_excluding_initializers()[1]
- # attention_mask = self.model.get_graph_inputs_excluding_initializers()[2]
-
- emblayernorm_out = word_embeddings_node[1].output[0]
- emblayernorm_out_mask = attention_mask_node[0].output[0]
-
- # self.model.modify_node_output_type(emblayernorm_out_mask, 5)
-
- beta_data = self.model.get_initializer(word_embeddings_node[1].input[2], True)
- embeddings_layernorm_beta_name = "bert_embeddings_layernorm_beta"
- embeddings_layernorm_beta = helper.make_tensor(
- embeddings_layernorm_beta_name, TensorProto.FLOAT, beta_data.shape, beta_data.flatten().tolist())
-
- gamma_data = self.model.get_initializer(word_embeddings_node[1].input[1], True)
- embeddings_layernorm_gamma_name = "bert_embeddings_layernorm_gamma"
- embeddings_layernorm_gamma = helper.make_tensor(
- embeddings_layernorm_gamma_name, TensorProto.FLOAT, gamma_data.shape, gamma_data.flatten().tolist())
-
- embeddings_word_embeddings_data = self.model.get_initializer(word_embeddings_node[4].input[0], True)
- embeddings_word_embeddings_name = "bert_embeddings_word_embeddings"
- embeddings_word_embeddings = helper.make_tensor(
- embeddings_word_embeddings_name, TensorProto.FLOAT, embeddings_word_embeddings_data.shape,
- embeddings_word_embeddings_data.flatten().tolist())
-
- embeddings_token_type_embeddings_data = self.model.get_initializer(token_type_embeddings_node[4].input[0], True)
- embeddings_token_type_embeddings_name = "bert_embeddings_token_type_embeddings"
- embeddings_token_type_embeddings = helper.make_tensor(
- embeddings_token_type_embeddings_name, TensorProto.FLOAT, embeddings_token_type_embeddings_data.shape,
- embeddings_token_type_embeddings_data.flatten().tolist())
-
- embeddings_position_embeddings_data = self.model.get_initializer(token_type_embeddings_node[2].input[1], True)
- embeddings_position_embeddings_name = "bert_embeddings_token_type_embeddings"
- embeddings_position_embeddings = helper.make_tensor(
- embeddings_position_embeddings_name, TensorProto.FLOAT, embeddings_position_embeddings_data.shape,
- embeddings_position_embeddings_data.flatten().tolist())
-
- self.model.add_initializer(embeddings_layernorm_beta, self.this_graph_name)
- self.model.add_initializer(embeddings_layernorm_gamma, self.this_graph_name)
- self.model.add_initializer(embeddings_word_embeddings, self.this_graph_name)
- self.model.add_initializer(embeddings_token_type_embeddings, self.this_graph_name)
- self.model.add_initializer(embeddings_position_embeddings, self.this_graph_name)
-
-
- emblayernorm_node = helper.make_node(
- "CustomEmbLayerNormPluginDynamic_IxRT",
- inputs=[word_embeddings_node[4].input[1], token_type_embeddings_node[4].input[1], attention_mask_node[3].input[0]],
- outputs=[emblayernorm_out, emblayernorm_out_mask],
- name=self.model.create_node_name(
- "BertEmbedLayerNormalization", name_prefix="BertEmbedLayerNormalization"
- ),
- )
- emblayernorm_node.domain = "com.iluvatar"
- emblayernorm_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- emblayernorm_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- emblayernorm_node.attribute.extend([helper.make_attribute("output_fp16", 1)])
- emblayernorm_node.attribute.extend([helper.make_attribute("full_mask", 1)])
- emblayernorm_node.attribute.extend([helper.make_attribute("mha_type_id", 2)])
- emblayernorm_node.attribute.extend([helper.make_attribute("pad_id", 0)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_beta", embeddings_layernorm_beta)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_gamma", embeddings_layernorm_gamma)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_word_embeddings", embeddings_word_embeddings)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_token_type_embeddings", embeddings_token_type_embeddings)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_position_embeddings", embeddings_position_embeddings)])
-
- self.nodes_to_remove.extend(subgraph_nodes_remove)
-
- self.nodes_to_add.append(emblayernorm_node)
- self.node_name_to_graph_name[emblayernorm_node.name] = self.this_graph_name
-
-
-class FusionAlbertEmbedLayerNormalization(Fusion):
- """
- Fuse AlbertEmbedLayerNormalization subgraph into one node.
- """
-
- def __init__(self, model: OnnxModel):
- super().__init__(
- model, "CustomEmbLayerNormPluginDynamic_IxRT", "CustomQKVToContextPluginDynamic_IxRT"
- )
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- input --> CustomEmbLayerNormPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT --> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT
- """
- children = self.model.get_children(node, input_name_to_nodes)
- parent = self.model.get_parents(node, output_name_to_node)
-
- if len(children) == 0:
- return
- if len(parent) == 0:
- return
-
- start_node = node
-
- # word_embeddings
- word_embeddings_node = self.model.match_parent_path(
- start_node,
- ["CustomFCPluginDynamic_IxRT","CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"],
- [0, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
-
- # token_type_embeddings
- token_type_embeddings_node = self.model.match_parent_path(
- start_node,
- ["CustomFCPluginDynamic_IxRT","CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"],
- [0, 0, 0, 0, 0, 1],
- output_name_to_node,
- )
-
- # attention_mask
- attention_mask_node = self.model.match_parent_path(
- start_node,
- ["Mul", "Sub", "Cast", "Unsqueeze"],
- [1, 0, 1, 0],
- output_name_to_node,
- )
-
- if word_embeddings_node is None or token_type_embeddings_node is None or attention_mask_node is None:
- return
-
- if word_embeddings_node and token_type_embeddings_node and attention_mask_node:
- subgraph_nodes = []
- subgraph_nodes.extend(word_embeddings_node)
- subgraph_nodes.extend(token_type_embeddings_node)
- subgraph_nodes.extend(attention_mask_node)
-
- subgraph_nodes_unique = []
- for item in subgraph_nodes:
- if item not in subgraph_nodes_unique:
- subgraph_nodes_unique.append(item)
- subgraph_nodes_remove = []
- for item in subgraph_nodes_unique:
- if item.op_type != "CustomFCPluginDynamic_IxRT":
- subgraph_nodes_remove.append(item)
-
- # input_ids = self.model.get_graph_inputs_excluding_initializers()[0]
- # token_type_ids = self.model.get_graph_inputs_excluding_initializers()[1]
- # attention_mask = self.model.get_graph_inputs_excluding_initializers()[2]
-
- emblayernorm_out = word_embeddings_node[2].output[0]
- emblayernorm_out_mask = attention_mask_node[0].output[0]
-
- beta_data = self.model.get_initializer(word_embeddings_node[2].input[2], True)
- embeddings_layernorm_beta_name = "bert_embeddings_layernorm_beta"
- embeddings_layernorm_beta = helper.make_tensor(
- embeddings_layernorm_beta_name, TensorProto.FLOAT, beta_data.shape, beta_data.flatten().tolist())
-
- gamma_data = self.model.get_initializer(word_embeddings_node[2].input[1], True)
- embeddings_layernorm_gamma_name = "bert_embeddings_layernorm_gamma"
- embeddings_layernorm_gamma = helper.make_tensor(
- embeddings_layernorm_gamma_name, TensorProto.FLOAT, gamma_data.shape, gamma_data.flatten().tolist())
-
- embeddings_word_embeddings_data = self.model.get_initializer(word_embeddings_node[5].input[0], True)
- embeddings_word_embeddings_name = "bert_embeddings_word_embeddings"
- embeddings_word_embeddings = helper.make_tensor(
- embeddings_word_embeddings_name, TensorProto.FLOAT, embeddings_word_embeddings_data.shape,
- embeddings_word_embeddings_data.flatten().tolist())
-
- embeddings_token_type_embeddings_data = self.model.get_initializer(token_type_embeddings_node[5].input[0], True)
- embeddings_token_type_embeddings_name = "bert_embeddings_token_type_embeddings"
- embeddings_token_type_embeddings = helper.make_tensor(
- embeddings_token_type_embeddings_name, TensorProto.FLOAT, embeddings_token_type_embeddings_data.shape,
- embeddings_token_type_embeddings_data.flatten().tolist())
-
- embeddings_position_embeddings_data = self.model.get_initializer(token_type_embeddings_node[3].input[1], True)
- embeddings_position_embeddings_name = "bert_embeddings_position_embeddings"
- embeddings_position_embeddings = helper.make_tensor(
- embeddings_position_embeddings_name, TensorProto.FLOAT, embeddings_position_embeddings_data.shape,
- embeddings_position_embeddings_data.flatten().tolist())
-
- self.model.add_initializer(embeddings_layernorm_beta, self.this_graph_name)
- self.model.add_initializer(embeddings_layernorm_gamma, self.this_graph_name)
- self.model.add_initializer(embeddings_word_embeddings, self.this_graph_name)
- self.model.add_initializer(embeddings_token_type_embeddings, self.this_graph_name)
- self.model.add_initializer(embeddings_position_embeddings, self.this_graph_name)
-
- emblayernorm_node = helper.make_node(
- "CustomEmbLayerNormPluginDynamic_IxRT",
- inputs=[word_embeddings_node[5].input[1], token_type_embeddings_node[5].input[1], attention_mask_node[3].input[0]],
- outputs=[emblayernorm_out, emblayernorm_out_mask],
- name=self.model.create_node_name(
- "BertEmbedLayerNormalization", name_prefix="BertEmbedLayerNormalization"
- ),
- )
- emblayernorm_node.domain = "com.iluvatar"
- emblayernorm_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- emblayernorm_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- emblayernorm_node.attribute.extend([helper.make_attribute("output_fp16", 1)])
- emblayernorm_node.attribute.extend([helper.make_attribute("full_mask", 1)])
- emblayernorm_node.attribute.extend([helper.make_attribute("mha_type_id", 2)])
- emblayernorm_node.attribute.extend([helper.make_attribute("pad_id", 0)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_beta", embeddings_layernorm_beta)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_gamma", embeddings_layernorm_gamma)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_word_embeddings", embeddings_word_embeddings)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_token_type_embeddings", embeddings_token_type_embeddings)])
- emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_position_embeddings", embeddings_position_embeddings)])
-
- self.nodes_to_remove.extend(subgraph_nodes_remove)
-
- self.nodes_to_add.append(emblayernorm_node)
- self.node_name_to_graph_name[emblayernorm_node.name] = self.this_graph_name
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py
deleted file mode 100644
index 067ff26e4eb51ea0df3ad6b49318179afd3b4177..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py
+++ /dev/null
@@ -1,420 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from logging import getLogger
-from typing import Dict, Optional
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionFastGelu(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "CustomGeluPluginDynamic_IxRT", "Tanh")
-
- def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- if self.fuse_1(tanh_node, input_name_to_nodes, output_name_to_node):
- return
-
- if self.fuse_2(tanh_node, input_name_to_nodes, output_name_to_node):
- return
-
- if self.fuse_3(tanh_node, input_name_to_nodes, output_name_to_node):
- return
-
- def fuse_1(
- self, tanh_node, input_name_to_nodes, output_name_to_node
- ) -> Optional[bool]:
- """
- Fuse Gelu with tanh into one node:
- +---------------------------+
- | |
- | v
- [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul
- | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^
- | |
- +------> Mul(B=0.5)--------------------------------------------+
- Note that the constant input for Add and Mul could be either the first or the second input, e.g. either A=0.5 or B=0.5 is fine.
- """
- if tanh_node.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[tanh_node.output[0]]
- if len(children) != 1 or children[0].op_type != "Add":
- return
- add_after_tanh = children[0]
-
- if not self.model.has_constant_input(add_after_tanh, 1.0):
- return
-
- if add_after_tanh.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[add_after_tanh.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_after_tanh = children[0]
-
- mul_half = self.model.match_parent(
- mul_after_tanh, "Mul", None, output_name_to_node
- )
- if mul_half is None:
- return
-
- i = self.model.find_constant_input(mul_half, 0.5)
- if i < 0:
- return
-
- root_input = mul_half.input[0 if i == 1 else 1]
-
- # root_node could be None when root_input is graph input
- root_node = self.model.get_parent(
- mul_half, 0 if i == 1 else 1, output_name_to_node
- )
-
- mul_before_tanh = self.model.match_parent(
- tanh_node, "Mul", 0, output_name_to_node
- )
- if mul_before_tanh is None:
- return
-
- i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001)
- if i < 0:
- return
-
- add_before_tanh = self.model.match_parent(
- mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node
- )
- if add_before_tanh is None:
- return
-
- mul_after_pow = self.model.match_parent(
- add_before_tanh,
- "Mul",
- None,
- output_name_to_node,
- exclude=[root_node] if root_node else [],
- )
- if mul_after_pow is None:
- return
-
- i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001)
- if i < 0:
- return
-
- pow = self.model.match_parent(
- mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node
- )
- if pow is None:
- return
-
- if not self.model.has_constant_input(pow, 3.0):
- return
-
- if pow.input[0] != root_input:
- return
-
- subgraph_nodes = [
- mul_after_tanh,
- mul_half,
- add_after_tanh,
- tanh_node,
- mul_before_tanh,
- add_before_tanh,
- mul_after_pow,
- pow,
- ]
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- [mul_after_tanh.output[0]],
- input_name_to_nodes,
- output_name_to_node,
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- fused_node = helper.make_node(
- "CustomGeluPluginDynamic_IxRT",
- inputs=[root_input],
- outputs=mul_after_tanh.output,
- name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"),
- )
- fused_node.domain = "com.iluvatar"
- fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- return True
-
- def fuse_2(
- self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict
- ) -> Optional[bool]:
- """
- This pattern is from Tensorflow model.
- Fuse Gelu with tanh into one node:
- +---------------------------+
- | |
- | v
- [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul(B=0.5)-->Mul-->
- | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^
- | |
- +---------------------------------------------------------------------------+
- Note that the constant input for Add and Mul could be either the first or the second input, e.g. either A=0.5 or B=0.5 is fine.
- """
- if tanh_node.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[tanh_node.output[0]]
- if len(children) != 1 or children[0].op_type != "Add":
- return
- add_after_tanh = children[0]
-
- if not self.model.has_constant_input(add_after_tanh, 1.0):
- return
-
- if add_after_tanh.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[add_after_tanh.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_half = children[0]
-
- i = self.model.find_constant_input(mul_half, 0.5)
- if i < 0:
- return
-
- if mul_half.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[mul_half.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_after_mul_half = children[0]
-
- root_node = self.model.get_parent(
- mul_after_mul_half,
- 0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1,
- output_name_to_node,
- )
- if root_node is None:
- return
-
- mul_before_tanh = self.model.match_parent(
- tanh_node, "Mul", 0, output_name_to_node
- )
- if mul_before_tanh is None:
- return
-
- i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001)
- if i < 0:
- return
-
- add_before_tanh = self.model.match_parent(
- mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node
- )
- if add_before_tanh is None:
- return
-
- mul_after_pow = self.model.match_parent(
- add_before_tanh, "Mul", None, output_name_to_node, exclude=[root_node]
- )
- if mul_after_pow is None:
- return
-
- i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001)
- if i < 0:
- return
-
- pow = self.model.match_parent(
- mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node
- )
- if pow is None:
- return
-
- if not self.model.has_constant_input(pow, 3.0):
- return
-
- if pow.input[0] != root_node.output[0]:
- return
-
- subgraph_nodes = [
- mul_after_mul_half,
- mul_half,
- add_after_tanh,
- tanh_node,
- mul_before_tanh,
- add_before_tanh,
- mul_after_pow,
- pow,
- ]
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- [mul_after_mul_half.output[0]],
- input_name_to_nodes,
- output_name_to_node,
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- fused_node = helper.make_node(
- "CustomGeluPluginDynamic_IxRT",
- inputs=[root_node.output[0]],
- outputs=mul_after_mul_half.output,
- name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"),
- )
- fused_node.domain = "com.iluvatar"
- fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- return True
-
- def fuse_3(
- self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict
- ) -> Optional[bool]:
- """
- OpenAI's gelu implementation, also used in Megatron:
- Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))
-
- Fuse subgraph into a FastGelu node:
- +------------ Mul (B=0.79788456) -------------------+
- | |
- +-------------------------------+ |
- | | |
- | v v
- [root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul-->
- | ^
- | |
- +-----------> Mul (B=0.5) --------------------------------------------------------+
- """
- if tanh_node.output[0] not in input_name_to_nodes:
- return
-
- children = input_name_to_nodes[tanh_node.output[0]]
- if len(children) != 1 or children[0].op_type != "Add":
- return
- add_after_tanh = children[0]
-
- if not self.model.has_constant_input(add_after_tanh, 1.0):
- return
-
- if add_after_tanh.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[add_after_tanh.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_last = children[0]
-
- mul_half = self.model.match_parent(mul_last, "Mul", None, output_name_to_node)
- if mul_half is None:
- return
-
- i = self.model.find_constant_input(mul_half, 0.5)
- if i < 0:
- return
-
- root_input = mul_half.input[0 if i == 1 else 1]
-
- mul_before_tanh = self.model.match_parent(
- tanh_node, "Mul", 0, output_name_to_node
- )
- if mul_before_tanh is None:
- return
-
- add_1 = self.model.match_parent(
- mul_before_tanh, "Add", None, output_name_to_node
- )
- if add_1 is None:
- return
- j = self.model.find_constant_input(add_1, 1.0)
- if j < 0:
- return
-
- mul_7978 = self.model.match_parent(
- mul_before_tanh, "Mul", None, output_name_to_node
- )
- if mul_7978 is None:
- return
- k = self.model.find_constant_input(mul_7978, 0.7978, delta=0.0001)
- if k < 0:
- return
- if mul_7978.input[0 if k == 1 else 1] != root_input:
- return
-
- mul_before_add_1 = self.model.match_parent(
- add_1, "Mul", 0 if j == 1 else 1, output_name_to_node
- )
- if mul_before_add_1 is None:
- return
-
- if mul_before_add_1.input[0] == root_input:
- another = 1
- elif mul_before_add_1.input[1] == root_input:
- another = 0
- else:
- return
-
- mul_0447 = self.model.match_parent(
- mul_before_add_1, "Mul", another, output_name_to_node
- )
- if mul_0447 is None:
- return
- m = self.model.find_constant_input(mul_0447, 0.0447, delta=0.0001)
- if m < 0:
- return
-
- if mul_0447.input[0 if m == 1 else 1] != root_input:
- return
-
- subgraph_nodes = [
- mul_0447,
- mul_before_add_1,
- add_1,
- mul_before_tanh,
- tanh_node,
- add_after_tanh,
- mul_7978,
- mul_half,
- mul_last,
- ]
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- [mul_last.output[0]],
- input_name_to_nodes,
- output_name_to_node,
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- fused_node = helper.make_node(
- "CustomGeluPluginDynamic_IxRT",
- inputs=[root_input],
- outputs=mul_last.output,
- name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"),
- )
- fused_node.domain = "com.iluvatar"
- fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- return True
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py
deleted file mode 100644
index 1f60ab7628f1d700042cf1e025df5bb22fc1d641..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionRemoveUselessElementwise(Fusion):
- """
- Fusion to remove useless elementwise in roformer model.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(model, "Sqrt", "Sqrt")
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- paths = {
- "path1": (
- ["Max", "Min", "Add", "GlobalAveragePool"],
- [None, None, None, None],
- ),
- }
-
- pool_nodes, pool_path = self.match_parent_path_from_dict(node, paths)
-
- if pool_nodes is None:
- logger.debug("GlobalAveragePool: failed searching path after pool node.")
- return
-
- max_node = pool_nodes[0]
- min_node = pool_nodes[1]
- add_node = pool_nodes[2]
- pool_node = pool_nodes[3]
- if not self.model.has_constant_input(add_node, 9.999999960041972e-13):
- return
-
- if not self.model.has_constant_input(max_node, 0):
- return
-
- max_node.input[0] = pool_node.output[0]
- self.nodes_to_remove.extend([min_node, add_node])
-
-
-class FusionFormatInvalidMask(Fusion):
- """
- Fusion to format invalid mask in roformer model.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(model, "", ["Greater"])
-
- def fuse(self, start_node, input_name_to_nodes, output_name_to_node):
- nodes = self.model.match_parent_path(
- start_node,
- [
- "ReduceMin",
- "Cast",
- "Concat",
- "Unsqueeze",
- "Greater",
- "ReduceMin",
- "Cast",
- "Concat",
- "Unsqueeze",
- ],
- [0, 0, 0, 0, 0, 0, 0, 0, 0],
- )
-
- if nodes is None:
- logger.debug("Roformer: unable to format the mask.")
- return
-
- unsqueeze_node = nodes[-1]
-
- for node in self.model.graph().node:
- for i, input_name in enumerate(node.input):
- if start_node.output[0] == input_name:
- node.input[i] = unsqueeze_node.input[0]
-
- self.nodes_to_remove.extend(nodes)
- self.nodes_to_remove.extend([start_node])
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py
deleted file mode 100644
index 714212664e452ad7a42daa3623185d973e4bb773..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from logging import getLogger
-from typing import Dict, Optional
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionGelu(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "Gelu", "Erf")
-
- def fuse(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- if self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node):
- return
- if self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node):
- return
- if self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node):
- return
- self.fuse_4(erf_node, input_name_to_nodes, output_name_to_node)
-
- def fuse_1(
- self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict
- ) -> Optional[bool]:
- """
- This pattern is from PyTorch model
- Fuse Gelu with Erf into one node:
- Pattern 1:
- +-------Mul(0.5)---------------------+
- | |
- | v
- [root] --> Div -----> Erf --> Add --> Mul -->
- (B=1.4142...) (1)
-
- Pattern 2:
- +------------------------------------+
- | |
- | v
- [root] --> Div -----> Erf --> Add --> Mul -->Mul -->
- (B=1.4142...) (1) (0.5)
-
- Note that the constant input for Add and Mul can be either the first or the second input, e.g., A=0.5 or B=0.5.
- """
- if erf_node.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[erf_node.output[0]]
- if len(children) != 1 or children[0].op_type != "Add":
- return
- add_after_erf = children[0]
-
- if not self.model.has_constant_input(add_after_erf, 1):
- return
-
- if add_after_erf.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[add_after_erf.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_after_erf = children[0]
-
- div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node)
- if div is None:
- return
-
- if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1:
- return
-
- subgraph_input = div.input[0]
-
- another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0
- if subgraph_input == mul_after_erf.input[another]: # pattern 2
- children = input_name_to_nodes[mul_after_erf.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_half = children[0]
- if not self.model.has_constant_input(mul_half, 0.5):
- return
- subgraph_output = mul_half.output[0]
- else: # pattern 1
- mul_half = self.model.match_parent(
- mul_after_erf, "Mul", another, output_name_to_node
- )
- if mul_half is None:
- return
-
- if not self.model.has_constant_input(mul_half, 0.5):
- return
-
- if subgraph_input not in mul_half.input:
- return
-
- subgraph_output = mul_after_erf.output[0]
-
- subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half]
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- fused_node = helper.make_node(
- "Gelu", inputs=[subgraph_input], outputs=[subgraph_output]
- )
- fused_node.domain = "com.microsoft"
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- return True
-
- def fuse_2(
- self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict
- ) -> Optional[bool]:
- """
- This pattern is from Keras model
- Fuse Gelu with Erf into one node:
- +------------------------------------------+
- | |
- | v
- [root] --> Div -----> Erf --> Add --> Mul -->Mul
- (B=1.4142...) (A=1) (A=0.5)
-
- Note that the constant input for Add and Mul can be either the first or the second input, e.g., A=0.5 or B=0.5.
- """
- if erf_node.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[erf_node.output[0]]
- if len(children) != 1 or children[0].op_type != "Add":
- return
- add_after_erf = children[0]
-
- if not self.model.has_constant_input(add_after_erf, 1):
- return
-
- if add_after_erf.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[add_after_erf.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_after_erf = children[0]
-
- if not self.model.has_constant_input(mul_after_erf, 0.5):
- return
-
- if mul_after_erf.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[mul_after_erf.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul = children[0]
-
- div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node)
- if div is None:
- return
-
- sqrt_node = None
- if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1:
- sqrt_node = self.model.match_parent(div, "Sqrt", 1, output_name_to_node)
- if sqrt_node is None:
- return
- if not self.model.has_constant_input(sqrt_node, 2.0):
- return
-
- root_node = self.model.get_parent(div, 0, output_name_to_node)
- if root_node is None:
- return
-
- if root_node.output[0] not in mul.input:
- return
-
- subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul]
- if sqrt_node:
- subgraph_nodes.append(sqrt_node)
-
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- fused_node = helper.make_node(
- "Gelu", inputs=[root_node.output[0]], outputs=[mul.output[0]]
- )
- fused_node.domain = "com.microsoft"
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- return True
-
- def fuse_3(
- self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict
- ) -> Optional[bool]:
- """
- This pattern is from TensorFlow model
- Fuse Gelu with Erf into one node:
- +----------------------------------------------+
- | |
- | v
- [root] --> Mul -----> Erf --> Add --> Mul -->Mul
- (A=0.7071067690849304) (B=1) (B=0.5)
-
- Note that the constant input for Add and Mul can be either the first or the second input, e.g., A=0.5 or B=0.5.
- """
-
- if erf_node.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[erf_node.output[0]]
- if len(children) != 1 or children[0].op_type != "Add":
- return
- add_after_erf = children[0]
-
- if not self.model.has_constant_input(add_after_erf, 1):
- return
-
- if add_after_erf.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[add_after_erf.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_half = children[0]
-
- if not self.model.has_constant_input(mul_half, 0.5):
- return
-
- first_mul = self.model.match_parent(erf_node, "Mul", 0, output_name_to_node)
- if first_mul is None:
- return
-
- i = self.model.find_constant_input(first_mul, 0.7071067690849304, delta=0.001)
- if i < 0:
- return
-
- root_node = self.model.get_parent(
- first_mul, 0 if i == 1 else 1, output_name_to_node
- )
- if root_node is None:
- return
-
- if mul_half.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[mul_half.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- last_mul = children[0]
-
- if not (
- last_mul.input[0] == root_node.output[0]
- or last_mul.input[1] == root_node.output[0]
- ):
- return
-
- subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul]
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- [last_mul.output[0]],
- input_name_to_nodes,
- output_name_to_node,
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- fused_node = helper.make_node(
- "Gelu", inputs=[root_node.output[0]], outputs=[last_mul.output[0]]
- )
- fused_node.domain = "com.microsoft"
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- return True
-
- def fuse_4(
- self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict
- ) -> Optional[bool]:
- """
- This pattern is from TensorFlow model
- Fuse Gelu with Erf into one node:
- Pattern 1:
- +-------Mul(0.5)---------------------+
- | |
- | v
- [root] --> Mul -----> Erf --> Add --> Mul -->
- (B=0.7071...) (1)
-
- Pattern 2:
- +------------------------------------+
- | |
- | v
- [root] --> Mul -----> Erf --> Add --> Mul -->Mul -->
- (B=0.7071...) (1) (0.5)
-
- Note that the constant input for Add and Mul can be either the first or the second input, e.g., A=0.5 or B=0.5.
- """
- if erf_node.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[erf_node.output[0]]
- if len(children) != 1 or children[0].op_type != "Add":
- return
- add_after_erf = children[0]
-
- if not self.model.has_constant_input(add_after_erf, 1):
- return
-
- if add_after_erf.output[0] not in input_name_to_nodes:
- return
- children = input_name_to_nodes[add_after_erf.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_after_erf = children[0]
-
- mul_before_erf = self.model.match_parent(
- erf_node, "Mul", 0, output_name_to_node
- )
- if mul_before_erf is None:
- return
-
- if self.model.find_constant_input(mul_before_erf, 0.7071, delta=0.001) != 1:
- return
-
- subgraph_input = mul_before_erf.input[0]
-
- another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0
- if subgraph_input == mul_after_erf.input[another]: # pattern 2
- children = input_name_to_nodes[mul_after_erf.output[0]]
- if len(children) != 1 or children[0].op_type != "Mul":
- return
- mul_half = children[0]
- if not self.model.has_constant_input(mul_half, 0.5):
- return
- subgraph_output = mul_half.output[0]
- else: # pattern 1
- mul_half = self.model.match_parent(
- mul_after_erf, "Mul", another, output_name_to_node
- )
- if mul_half is None:
- return
-
- if not self.model.has_constant_input(mul_half, 0.5):
- return
-
- if subgraph_input not in mul_half.input:
- return
-
- subgraph_output = mul_after_erf.output[0]
-
- subgraph_nodes = [
- mul_before_erf,
- erf_node,
- add_after_erf,
- mul_after_erf,
- mul_half,
- ]
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- fused_node = helper.make_node(
- "Gelu", inputs=[subgraph_input], outputs=[subgraph_output]
- )
- fused_node.domain = "com.microsoft"
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- return True
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py
deleted file mode 100644
index a89e558cb76aa8208e4a19983f038e9f3584ffdb..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .onnx_model import OnnxModel
-
-
-class FusionGeluApproximation(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "FastGelu", ["Gelu", "BiasGelu"], "GeluApproximation")
-
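- # FastGelu is the tanh approximation of GELU, so this pass trades a small amount of
- # accuracy for speed by rewriting exact Gelu/BiasGelu nodes into FastGelu.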
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- new_node = helper.make_node(
- "FastGelu",
- inputs=node.input,
- outputs=node.output,
- name=self.model.create_node_name(
- "FastGelu", node.op_type + "_Approximation"
- ),
- )
- new_node.domain = "com.microsoft"
- self.nodes_to_remove.append(node)
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py
deleted file mode 100644
index 805cd3bf7dfbf337a633eaa583d14833cdf86282..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py
+++ /dev/null
@@ -1,528 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from logging import getLogger
-
-import numpy as np
-from onnx import TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import FusionUtils
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionGptAttentionPastBase(Fusion):
- """Base class for GPT Attention Fusion with past state"""
-
- def __init__(self, model: OnnxModel, num_heads: int):
- super().__init__(model, "Attention", "LayerNormalization", "with past")
- self.num_heads = num_heads
- self.utils = FusionUtils(model)
- # map from attention mask name to the name after casting to int32
- self.casted_attention_mask = {}
-
- def match_past_pattern_1(self, concat_k, concat_v, output_name_to_node):
- # Pattern 1:
- # {past}
- # / \
- # / \
- # Gather(axes=0, indices=0) Gather(indices=1)
- # | |
- # Transpose (perm=0,1,3,2) |
- # | |
- # Concat_k Concat_v
- # | /
- # Transpose (perm=0,1,3,2) /
- # | /
- # Unsqueeze Unsqueeze
- # \ /
- # \ /
- # Concat
- # |
- # {present}
- gather = self.model.get_parent(concat_v, 0, output_name_to_node)
- if gather.op_type != "Gather":
- logger.debug("match_past_pattern_1: expect Gather for past")
- return None
-
- if not self.model.find_constant_input(gather, 1) == 1:
- logger.debug("match_past_pattern_1: expect indices=1 for Gather of past")
- return None
- past = gather.input[0]
-
- parent = self.model.get_parent(concat_k, 0, output_name_to_node)
- if parent.op_type == "Gather":
- gather_past_k = parent
- else:
- past_k_nodes = self.model.match_parent_path(
- concat_k, ["Transpose", "Gather"], [0, 0]
- )
- if past_k_nodes is None:
- logger.debug("match_past_pattern_1: failed match Transpose and Gather")
- return None
- gather_past_k = past_k_nodes[-1]
-
- if not self.model.find_constant_input(gather_past_k, 0) == 1:
- logger.debug("match_past_pattern_1: expect indices=0 for Gather k of past")
- return None
- past_k = gather_past_k.input[0]
- if past != past_k:
- logger.debug("match_past_pattern_1: expect past to be same")
- return None
-
- return past
-
- def match_past_pattern_2(self, concat_k, concat_v, output_name_to_node):
- # Pattern 2:
- # Split (QKV)
- # / | |
- # / | +----------------------+
- # | |
- # | {past} |
- # | | |
- # Reshape Split Reshape
- # | / \ |
- # Transpose_k Squeeze Squeeze Transpose_v
- # | | \ /
- # +------|---+ \ /
- # | | \ /
- # Concat_k Concat_v
- # | |
- # Unsqueeze Unsqueeze
- # \ /
- # Concat
- # |
- # {present}
- #
- squeeze = self.model.get_parent(concat_v, 0, output_name_to_node)
- if squeeze.op_type != "Squeeze":
- logger.debug("match_past_pattern_2: expect Squeeze as parent of concat_v")
- return None
-
- split = self.model.get_parent(squeeze, 0, output_name_to_node)
- if split.op_type != "Split":
- logger.debug("match_past_pattern_2: expect Split for past path")
- return None
-
- opset_version = self.model.get_opset_version()
- if opset_version < 13:
- if not FusionUtils.check_node_attribute(squeeze, "axes", [0]):
- logger.debug(
- "match_past_pattern_2: axes != [0] for Squeeze in past path"
- )
- return None
-
- if not FusionUtils.check_node_attribute(split, "split", [1, 1]):
- logger.debug(
- "match_past_pattern_2: split != [1, 1] for Split in past path"
- )
- return None
- else:
- if not self.utils.check_node_input_value(squeeze, 1, [0]):
- logger.debug(
- "match_past_pattern_2: axes != [0] for Squeeze in past path"
- )
- return None
-
- if not self.utils.check_node_input_value(split, 1, [1, 1]):
- logger.debug(
- "match_past_pattern_2: split != [1, 1] for Split in past path"
- )
- return None
-
- if not FusionUtils.check_node_attribute(split, "axis", 0, default_value=0):
- logger.debug(
- "match_past_pattern_2: attribute axis of Split are not expected in past path"
- )
- return None
- past = split.input[0]
-
- past_k_nodes = self.model.match_parent_path(
- concat_k, ["Squeeze", "Split"], [0, 0]
- )
- if past_k_nodes is None:
- logger.debug("match_past_pattern_2: failed to match past_k_nodes path")
- return None
- past_k = past_k_nodes[-1].input[0]
-
- if past != past_k:
- logger.info("match_past_pattern_2: expect past to be same")
- return None
-
- return past
-
- def match_present(self, concat_v, input_name_to_nodes):
- unsqueeze_present_v = self.model.find_first_child_by_type(
- concat_v, "Unsqueeze", input_name_to_nodes, recursive=False
- )
- if not unsqueeze_present_v:
- logger.info("expect unsqueeze for present")
- return None
- concat_present = self.model.find_first_child_by_type(
- unsqueeze_present_v, "Concat", input_name_to_nodes, recursive=False
- )
- if not concat_present:
- logger.info("expect concat for present")
- return None
-
- present = concat_present.output[0]
- return present
-
- def cast_attention_mask(self, input_name):
- if input_name in self.casted_attention_mask:
- attention_mask_input_name = self.casted_attention_mask[input_name]
- elif self.model.find_graph_input(input_name):
- casted, attention_mask_input_name = self.utils.cast_graph_input_to_int32(
- input_name
- )
- self.casted_attention_mask[input_name] = attention_mask_input_name
- else:
- attention_mask_input_name, cast_node = self.utils.cast_input_to_int32(
- input_name
- )
- self.casted_attention_mask[input_name] = attention_mask_input_name
- return attention_mask_input_name
-
-
-class FusionGptAttention(FusionGptAttentionPastBase):
- """
- Fuse GPT-2 Attention with past state subgraph into one Attention node.
- """
-
- def __init__(self, model: OnnxModel, num_heads: int):
- super().__init__(model, num_heads)
-
- def create_attention_node(
- self,
- fc_weight,
- fc_bias,
- gemm_qkv,
- past,
- present,
- input,
- output,
- mask,
- is_unidirectional,
- ):
- attention_node_name = self.model.create_node_name("GptAttention")
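- # The com.microsoft Attention contrib op takes (input, weight, bias, mask_index, past)
- # and produces (output, present); the fused QKV projection weight and bias are passed
- # through directly, while the output projection stays as the separate MatMul + Add below.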
- attention_node = helper.make_node(
- "Attention",
- inputs=[input, fc_weight, fc_bias, mask, past],
- outputs=[attention_node_name + "_output", present],
- name=attention_node_name,
- )
- attention_node.domain = "com.microsoft"
- attention_node.attribute.extend(
- [
- helper.make_attribute("num_heads", self.num_heads),
- helper.make_attribute("unidirectional", 1 if is_unidirectional else 0),
- ]
- )
-
- matmul_node = helper.make_node(
- "MatMul",
- inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
- outputs=[attention_node_name + "_matmul_output"],
- name=attention_node_name + "_matmul",
- )
-
- add_node = helper.make_node(
- "Add",
- inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
- outputs=[output],
- name=attention_node_name + "_add",
- )
- self.nodes_to_add.extend([attention_node, matmul_node, add_node])
- self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
- self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name
- self.node_name_to_graph_name[add_node.name] = self.this_graph_name
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
- past = None
- present = None
- return_indice = []
- qkv_nodes = self.model.match_parent_path(
- normalize_node,
- ["Add", "Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"],
- [0, None, 0, 0, 0, 0, 0],
- output_name_to_node=output_name_to_node,
- return_indice=return_indice,
- ) # yapf: disable
- if qkv_nodes is None:
- return
- (
- add_qkv,
- reshape_qkv,
- gemm_qkv,
- reshape_1,
- reshape_2,
- transpose_qkv,
- matmul_qkv,
- ) = qkv_nodes
-
- another_input = add_qkv.input[1 - return_indice[0]]
-
- v_nodes = self.model.match_parent_path(
- matmul_qkv, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0]
- )
- if v_nodes is None:
- logger.debug("fuse_attention: failed to match v path")
- return
- (concat_v, transpose_v, reshape_v, split_fc) = v_nodes
-
- fc_nodes = self.model.match_parent_path(
- split_fc,
- ["Reshape", "Gemm", "Reshape", "LayerNormalization"],
- [0, 0, 0, 0],
- output_name_to_node,
- )
- if fc_nodes is None:
- fc_nodes = self.model.match_parent_path(
- split_fc,
- ["Add", "MatMul", "LayerNormalization"],
- [0, None, 0],
- output_name_to_node,
- )
- if fc_nodes is None:
- logger.debug("fuse_attention: failed to match fc path")
- return
- fc_weight = fc_nodes[1].input[1]
- i, _ = self.model.get_constant_input(fc_nodes[0])
- fc_bias = fc_nodes[0].input[i]
- else:
- fc_weight = fc_nodes[1].input[1]
- fc_bias = fc_nodes[1].input[2]
-
- layernorm_before_attention = fc_nodes[-1]
-
- if another_input not in layernorm_before_attention.input:
- logger.debug("Add and LayerNormalization should share a common input")
- return
-
- is_unidirectional = True
- slice_mask = None
- input_mask_nodes = None
- concat_k_to_match = None
- qk_nodes = self.model.match_parent_path(
- matmul_qkv, ["Softmax", "Sub", "Mul", "Div", "MatMul"], [0, 0, 0, 0, 0]
- )
- if qk_nodes is not None:
- (softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes
- mask_nodes = self.model.match_parent_path(
- sub_qk,
- [
- "Mul",
- "Sub",
- "Slice",
- "Slice",
- "Unsqueeze",
- "Sub",
- "Squeeze",
- "Slice",
- "Shape",
- "Div",
- ],
- [1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
- ) # yapf: disable
- if mask_nodes is None:
- logger.debug("fuse_attention: failed to match unidirectional mask path")
- return
- div_mask = mask_nodes[-1]
- slice_mask = mask_nodes[3]
-
- if div_qk != div_mask:
- logger.debug("fuse_attention: skip since div_qk != div_mask")
- return
- else:
- # New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0.
- i, qk_nodes, _ = self.model.match_parent_paths(
- matmul_qkv,
- [
- (["Softmax", "Where", "Div", "MatMul"], [0, 0, 1, 0]),
- (["Softmax", "Add", "Where", "Div", "MatMul"], [0, 0, None, 1, 0]),
- ],
- output_name_to_node,
- )
- if qk_nodes is None:
- logger.debug("fuse_attention: failed to match qk nodes")
- return
-
- where_qk = qk_nodes[-3]
- div_qk = qk_nodes[-2]
- matmul_qk = qk_nodes[-1]
-
- if i == 1:
- add_qk = qk_nodes[1]
- _, input_mask_nodes, _ = self.model.match_parent_paths(
- add_qk,
- [
- (
- ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze", "Reshape"],
- [None, 0, 1, 0, 0, 0],
- ),
- (
- ["Mul", "Sub", "Unsqueeze", "Unsqueeze", "Reshape"],
- [None, 0, 1, 0, 0],
- ),
- (
- ["Mul", "Sub", "Unsqueeze", "Unsqueeze"],
- [None, 0, 1, 0],
- ), # useless cast and reshape are removed.
- ],
- output_name_to_node,
- ) # yapf: disable
- if input_mask_nodes is None:
- logger.debug(
- "fuse_attention: failed to match input attention mask path"
- )
- return
-
- mask_nodes = self.model.match_parent_path(
- where_qk,
- [
- "Cast",
- "Slice",
- "Slice",
- "Unsqueeze",
- "Sub",
- "Squeeze",
- "Slice",
- "Shape",
- ],
- [0, 0, 0, 1, 0, 0, 0, 0],
- output_name_to_node,
- ) # yapf: disable
- if mask_nodes is None:
- # TODO: match mask path for GPT2LMHeadModel_BeamSearchStep.
- logger.debug("fuse_attention: failed to match mask path")
- return
-
- slice_mask = mask_nodes[2]
-
- div_or_concat = self.model.get_parent(
- mask_nodes[-1], 0, output_name_to_node
- )
- if div_or_concat.op_type == "Div":
- div_mask = div_or_concat
- if div_qk != div_mask:
- logger.debug("fuse_attention: skip since div_qk != div_mask")
- return
- elif div_or_concat.op_type == "Concat":
- concat_k_to_match = div_or_concat
- else:
- logger.debug("fuse_attention: failed to match mask path")
-
- # Validate that the mask data is either lower triangular (unidirectional) or all ones
- mask_data = numpy_helper.to_array(
- self.model.get_initializer(slice_mask.input[0])
- )
- if not (
- len(mask_data.shape) == 4
- and mask_data.shape[:2] == (1, 1)
- and mask_data.shape[2] == mask_data.shape[3]
- ):
- logger.debug("fuse_attention: skip since mask shape is not 1x1xWxW")
- return
- if np.allclose(mask_data, np.ones_like(mask_data)):
- is_unidirectional = False
- elif not np.allclose(mask_data, np.tril(np.ones_like(mask_data))):
- logger.debug(
- "fuse_attention: skip since mask is neither lower triangular nor ones"
- )
- return
-
- q_nodes = self.model.match_parent_path(
- matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0]
- )
- if q_nodes is None:
- logger.debug("fuse_attention: failed to match q path")
- return
- (transpose_q, reshape_q, split_q) = q_nodes
- if split_fc != split_q:
- logger.debug("fuse_attention: skip since split_fc != split_q")
- return
-
- k_nodes = self.model.match_parent_path(
- matmul_qk, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0]
- )
- if k_nodes is None:
- # This pattern is from pytorch 1.7.1 and transformers 4.6.1
- k_nodes = self.model.match_parent_path(
- matmul_qk,
- ["Transpose", "Concat", "Transpose", "Reshape", "Split"],
- [1, 0, 1, 0, 0],
- )
- if k_nodes is None:
- logger.debug("fuse_attention: failed to match k path")
- return
- else:
- (_, concat_k, transpose_k, reshape_k, split_k) = k_nodes
- else:
- (concat_k, transpose_k, reshape_k, split_k) = k_nodes
- if split_fc != split_k:
- logger.debug("fuse_attention: skip since split_fc != split_k")
- return
-
- if concat_k_to_match and concat_k != concat_k_to_match:
- logger.debug("fuse_attention: skip since concat_k != concat_k_to_match")
- return
-
- attention_mask_input_name = ""
- if input_mask_nodes is not None:
- input_name = input_mask_nodes[-1].input[0]
- attention_mask_input_name = self.cast_attention_mask(input_name)
-
- # Match past and present paths
- past = self.match_past_pattern_1(
- concat_k, concat_v, output_name_to_node
- ) or self.match_past_pattern_2(concat_k, concat_v, output_name_to_node)
- if past is None:
- logger.info("fuse_attention: failed to match past path")
- return
- if not self.model.find_graph_input(past):
- logger.debug("past is not graph input.")
- # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input.
-
- present = self.match_present(concat_v, input_name_to_nodes)
- if present is None:
- logger.info("fuse_attention: failed to match present path")
- return
- if not self.model.find_graph_output(present):
- logger.info("expect present to be graph output")
- return
-
- self.create_attention_node(
- fc_weight,
- fc_bias,
- gemm_qkv,
- past,
- present,
- layernorm_before_attention.output[0],
- reshape_qkv.output[0],
- attention_mask_input_name,
- is_unidirectional,
- )
-
- # we rely on prune_graph() to clean old subgraph nodes:
- # qk_nodes + q_nodes + k_nodes + v_nodes + mask_nodes + [reshape_qkv, transpose_qkv, matmul_qkv]
- self.prune_graph = True
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py
deleted file mode 100644
index 138a9c5ff495d59830ec0c7761a674d7beacb834..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py
+++ /dev/null
@@ -1,342 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from logging import getLogger
-
-import numpy as np
-from onnx import TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_gpt_attention import FusionGptAttentionPastBase
-from .fusion_utils import FusionUtils
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-def is_close(value, expected_value):
- return abs(value - expected_value) <= 1e-6
-
-
-class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
- """
- Fuse GPT-2 Attention with past state subgraph from Megatron into one Attention node.
- """
-
- def __init__(self, model: OnnxModel, num_heads: int):
- super().__init__(model, num_heads)
-
- def fuse_attention_node(
- self,
- matmul_before_split,
- add_before_split,
- past,
- present,
- input,
- reshape_qkv,
- mask,
- ):
- attention_node_name = self.model.create_node_name("GptAttention")
- int32_mask = self.cast_attention_mask(mask)
- output = reshape_qkv.output[0]
- i = 1 if (add_before_split.input[0] == matmul_before_split.output[0]) else 0
- attention_node = helper.make_node(
- "Attention",
- inputs=[
- input,
- matmul_before_split.input[1],
- add_before_split.input[i],
- int32_mask,
- past,
- ],
- outputs=[output, present],
- name=attention_node_name,
- )
- attention_node.domain = "com.microsoft"
- attention_node.attribute.extend(
- [
- helper.make_attribute("num_heads", self.num_heads),
- helper.make_attribute(
- "unidirectional", 0
- ), # unidirectional shall not be ON for 4D attention mask
- ]
- )
-
- nodes_to_add = [attention_node]
- self.nodes_to_add.extend(nodes_to_add)
-
- for node in nodes_to_add:
- self.node_name_to_graph_name[node.name] = self.this_graph_name
-
- self.nodes_to_remove.append(reshape_qkv)
-
- # we rely on prune_graph() to clean old subgraph nodes
- self.prune_graph = True
-
- def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention):
- mask_nodes = self.model.match_parent_path(
- sub_qk, ["Mul", "Sub", "Slice", "Slice"], [1, 0, 1, 0]
- ) # yapf: disable
- if mask_nodes is None:
- logger.debug("fuse_attention: failed to match unidirectional mask path")
- return None
- (mul_mask, sub_mask, last_slice_mask, slice_mask) = mask_nodes
-
- if mul_qk.input[1] != last_slice_mask.output[0]:
- logger.debug(
- "fuse_attention failed: mul_qk.input[1] != last_slice_mask.output[0]"
- )
- return None
-
- if not self.utils.check_node_input_value(mul_mask, 1, 10000.0):
- logger.debug(
- "fuse_attention failed: mul_mask input 1 is not constant 10000.0"
- )
- return None
-
- if not self.utils.check_node_input_value(sub_mask, 0, 1.0):
- logger.debug("fuse_attention failed: sub_mask input 0 is not constant 1.0")
- return None
-
- if not self.model.find_graph_input(slice_mask.input[0]):
- logger.info("expect slick_mask input 0 to be graph input")
- return None
-
- if not self.utils.check_node_input_value(last_slice_mask, 1, [0]):
- logger.debug(
- "fuse_attention failed: last_slice_mask input 1 (starts) is not constant [0]"
- )
- return None
-
- if not self.utils.check_node_input_value(last_slice_mask, 3, [3]):
- logger.debug(
- "fuse_attention failed: last_slice_mask input 3 (axes) is not constant [3]"
- )
- return None
-
- if not self.utils.check_node_input_value(last_slice_mask, 4, [1]):
- logger.debug(
- "fuse_attention failed: last_slice_mask input 4 (steps) is not constant [1]"
- )
- return None
-
- if not self.utils.check_node_input_value(slice_mask, 3, [2]):
- logger.debug(
- "fuse_attention failed: slice_mask input 3 (axes) is not constant [2]"
- )
- return None
-
- if not self.utils.check_node_input_value(slice_mask, 4, [1]):
- logger.debug(
- "fuse_attention failed: slice_mask input 4 (steps) is not constant [1]"
- )
- return None
-
- last_slice_path = self.model.match_parent_path(
- last_slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0]
- )
- if last_slice_path is None or last_slice_path[-1] != matmul_qk:
- logger.debug("fuse_attention: failed to match last slice path")
- return None
-
- first_slice_path = self.model.match_parent_path(
- slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0]
- )
- if first_slice_path is None or first_slice_path[-1] != matmul_qk:
- logger.debug("fuse_attention: failed to match first slice path")
- return None
-
- first_slice_sub = self.model.match_parent_path(
- slice_mask,
- ["Unsqueeze", "Sub", "Gather", "Shape", "MatMul"],
- [1, 0, 0, 0, 0],
- )
- if first_slice_sub is None or first_slice_sub[-1] != matmul_qk:
- logger.debug("fuse_attention: failed to match last slice sub path")
- return None
-
- first_slice_sub_1 = self.model.match_parent_path(
- slice_mask,
- ["Unsqueeze", "Sub", "Gather", "Shape", "LayerNormalization"],
- [1, 0, 1, 0, 0],
- )
- if (
- first_slice_sub_1 is None
- or first_slice_sub_1[-1] != layernorm_before_attention
- ):
- logger.debug("fuse_attention: failed to match last slice sub path 1")
- return None
-
- return slice_mask.input[0]
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
- past = None
- present = None
-
- qkv_nodes = self.model.match_parent_path(
- normalize_node,
- ["Add", "Add", "MatMul", "Reshape", "Transpose", "MatMul"],
- [0, 1, None, 0, 0, 0],
- output_name_to_node=output_name_to_node,
- ) # yapf: disable
- if qkv_nodes is None:
- return
- (
- add_skip,
- add_after_attention,
- matmul_after_attention,
- reshape_qkv,
- transpose_qkv,
- matmul_qkv,
- ) = qkv_nodes
-
- skip_input = add_skip.input[0]
-
- v_nodes = self.model.match_parent_path(
- matmul_qkv,
- [
- "Concat",
- "Transpose",
- "Reshape",
- "Split",
- "Add",
- "MatMul",
- "LayerNormalization",
- ],
- [1, 1, 0, 0, 0, None, 0],
- ) # yapf: disable
- if v_nodes is None:
- logger.debug("fuse_attention: failed to match v path")
- return
- (
- concat_v,
- transpose_v,
- reshape_v,
- split_v,
- add_before_split,
- matmul_before_split,
- layernorm_before_attention,
- ) = v_nodes
- if skip_input != layernorm_before_attention.input[0]:
- logger.debug(
- "fuse_attention: skip_input != layernorm_before_attention.input[0]"
- )
- return
-
- qk_nodes = self.model.match_parent_path(
- matmul_qkv, ["Softmax", "Sub", "Mul", "MatMul"], [0, 0, 0, 0]
- )
- if qk_nodes is None:
- logger.debug("fuse_attention: failed to match qk path")
- return None
- (softmax_qk, sub_qk, mul_qk, matmul_qk) = qk_nodes
- if self.model.get_node_attribute(softmax_qk, "axis") != 3:
- logger.debug("fuse_attention failed: softmax_qk axis != 3")
- return None
-
- attention_mask = self.match_mask(
- sub_qk, mul_qk, matmul_qk, layernorm_before_attention
- )
-
- q_nodes = self.model.match_parent_path(
- matmul_qk, ["Div", "Transpose", "Reshape", "Split"], [0, 0, 0, 0]
- )
- if q_nodes is None:
- logger.debug("fuse_attention: failed to match q path")
- return
- (div_q, transpose_q, reshape_q, split_q) = q_nodes
- if split_v != split_q:
- logger.debug("fuse_attention: skip since split_v != split_q")
- return
-
- k_nodes = self.model.match_parent_path(
- matmul_qk,
- ["Div", "Transpose", "Concat", "Transpose", "Reshape", "Split"],
- [1, 0, 0, 1, 0, 0],
- )
- if k_nodes is None:
- logger.debug("fuse_attention: failed to match k path")
- return
- (div_k, _, concat_k, transpose_k, reshape_k, split_k) = k_nodes
- if split_v != split_k:
- logger.debug("fuse_attention: skip since split_v != split_k")
- return
-
- i, value = self.model.get_constant_input(reshape_k)
- if not (
- isinstance(value, np.ndarray)
- and list(value.shape) == [4]
- and value[0] == 0
- and value[1] == 0
- and value[2] > 0
- and value[3] > 0
- ):
- logger.debug("fuse_attention: reshape constant input is not [0, 0, N, H]")
- return
-
- num_heads = value[2]
- if num_heads != self.num_heads:
- logger.info(
- f"Detected num_heads={num_heads}. Ignore user specified value {self.num_heads}"
- )
- self.num_heads = num_heads
-
- hidden_size_per_head = value[3]
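- # Megatron divides Q and K each by sqrt(sqrt(head_size)) before the MatMul so that the
- # product carries the usual 1 / sqrt(head_size) attention scaling; both Div constants
- # are therefore expected to equal sqrt(sqrt(head_size)).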
- i, value = self.model.get_constant_input(div_k)
- expected_value = float(np.sqrt(np.sqrt(hidden_size_per_head)))
- if not is_close(value, expected_value):
- logger.debug(
- f"fuse_attention: div_k value={value} expected={expected_value}"
- )
- return
-
- i, value = self.model.get_constant_input(div_q)
- if not is_close(value, expected_value):
- logger.debug(
- f"fuse_attention: div_q value={value} expected={expected_value}"
- )
- return
-
- # Match past and present paths
- past = self.match_past_pattern_2(concat_k, concat_v, output_name_to_node)
- if past is None:
- logger.debug("fuse_attention: match past failed")
- return
- if not self.model.find_graph_input(past):
- logger.debug("fuse_attention: past is not graph input.")
- # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input.
-
- present = self.match_present(concat_v, input_name_to_nodes)
- if present is None:
- logger.debug("fuse_attention: match present failed")
- return
- if not self.model.find_graph_output(present):
- logger.info("fuse_attention: expect present to be graph output")
- return
-
- self.fuse_attention_node(
- matmul_before_split,
- add_before_split,
- past,
- present,
- layernorm_before_attention.output[0],
- reshape_qkv,
- attention_mask,
- )
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py
deleted file mode 100644
index 4e538cf5833d096635e461eae34ab35edd20d3b1..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from logging import getLogger
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionGptAttentionNoPast(Fusion):
- """
- Fuse GPT-2 Attention without past state into one Attention node.
- This does not support attention_mask graph input right now.
- """
-
- def __init__(self, model: OnnxModel):
- super().__init__(
- model,
- "CustomQKVToContextPluginDynamic_IxRT",
- ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],
- "without past",
- )
- self.where_qk_shared = None
-
- def get_num_heads_and_hidden_size(
- self, custom_fc: NodeProto, div: NodeProto
- ) -> Tuple[int, int]:
- div_initializer = self.model.get_initializer(div.input[1])
-
- # Check whether float_data is empty
- if len(div_initializer.float_data) > 0:
- div_value = div_initializer.float_data[0]
- else:
- # If float_data is empty, try another way to read the value,
- # e.g., when the data is stored in raw_data
- if len(div_initializer.raw_data) > 0:
- dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type]
- div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0]
- else:
- raise ValueError("Data not found in the div_initializer")
-
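- # The Div constant matched upstream is assumed to be sqrt(head_size) (the 1/sqrt(d)
- # attention scaling), so head_size is recovered as div_value^2; hidden_size is read from
- # the custom FC weight shape and num_heads follows as hidden_size // head_size.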
- for attr in custom_fc.attribute:
- if attr.name == "W":
- tensor_value = attr.t
- tensor_shape = [dim for dim in tensor_value.dims]
- break
- head_dim = math.ceil(div_value * div_value)
- hidden_size = tensor_shape[1]
- num_heads = hidden_size // head_dim
-
- return num_heads, hidden_size
-
- def create_attention_node(
- self,
- num_heads: int,
- hidden_size: int,
- input: str,
- output: str,
- where_qk: NodeProto,
- ) -> Union[NodeProto, None]:
-
- attention_node_name = self.model.create_node_name("Attention")
-
- attention_inputs = [input]
- if where_qk is not None:
- has_mask = 1
- has_qk_bias = 1
- attention_inputs.append(where_qk.output[0])
-
- attention_node = helper.make_node(
- "CustomQKVToContextPluginDynamic_IxRT",
- inputs=attention_inputs,
- outputs=[output],
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
- attention_node.attribute.extend(
- [helper.make_attribute("hidden_size", hidden_size)]
- )
- attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend(
- [helper.make_attribute("has_qk_bias", has_qk_bias)]
- )
- return attention_node
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
- return_indice = []
- add_qkv = normalize_node
- if normalize_node.op_type == "LayerNormalization":
- add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
- if add_before_layernorm is not None:
- add_qkv = add_before_layernorm
-
- qkv_paths = {
- "path1": (
- ["CustomFCPluginDynamic_IxRT", "Reshape", "Transpose", "MatMul"],
- [None, 0, 0, 0],
- ),
- "path2": (
- ["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"],
- [None, 0, 0],
- ),
- }
-
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(
- add_qkv,
- qkv_paths,
- output_name_to_node,
- return_indice,
- ) # yapf: disable
-
- if qkv_nodes is None:
- return
- reshape_2 = None
- if qkv_path == "path1":
- (
- custom_fc_after_attention,
- reshape_2,
- transpose_qkv,
- matmul_qkv,
- ) = qkv_nodes
- else:
- (
- custom_fc_after_attention,
- transpose_qkv,
- matmul_qkv,
- ) = qkv_nodes
-
- another_input = add_qkv.input[1 - return_indice[0]]
-
- v_nodes = self.model.match_parent_path(
- matmul_qkv,
- ["Transpose", "Reshape", "Split", "CustomFCPluginDynamic_IxRT"],
- [1, 0, 0, 0],
- ) # yapf: disable
- if v_nodes is None:
- logger.debug("fuse_attention: failed to match v path")
- return
- (
- transpose_v,
- reshape_v,
- split_v,
- custom_fc_before_attention,
- ) = v_nodes
-
- layernorm_before_attention = self.model.get_parent(
- custom_fc_before_attention, 0, output_name_to_node
- )
- if layernorm_before_attention is None:
- logger.debug("failed to get the parent of the custom FC before attention")
- return
- if layernorm_before_attention.op_type != "LayerNormalization":
- if layernorm_before_attention.op_type != "Add":
- logger.debug(
- f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}"
- )
- return
-
- if another_input not in layernorm_before_attention.input:
- # match openai-gpt
- if another_input not in layernorm_before_attention.output:
- logger.debug("Add and LayerNormalization should share a common input")
- return
-
- qk_nodes = self.model.match_parent_path(
- matmul_qkv, ["Softmax", "Add", "Where", "Div", "MatMul"], [0, None, 0, 1, 0]
- )
- where_qk = None
- matmul_qk = None
- mask_return_indices = []
- if qk_nodes is not None:
- (softmax_qk, add_qk, where_qk, div_qk, matmul_qk) = qk_nodes
- mask_nodes = self.model.match_parent_path(
- add_qk,
- ["Mul", "Sub", "Cast", "Unsqueeze"],
- [None, 0, 1, 0],
- return_indice=mask_return_indices,
- ) # yapf: disable
- if mask_nodes is None:
- logger.debug("fuse_attention: failed to match mask path")
- return
-
- q_nodes = self.model.match_parent_path(
- matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0]
- )
- if q_nodes is None:
- logger.debug("fuse_attention: failed to match q path")
- return
- (transpose_q, reshape_q, split_q) = q_nodes
- if split_v != split_q:
- logger.debug("fuse_attention: skip since split_v != split_q")
- return
-
- k_nodes = self.model.match_parent_path(
- matmul_qk, ["Transpose", "Reshape", "Split"], [1, 0, 0]
- )
- if k_nodes is None:
- logger.debug("fuse_attention: failed to match k path")
- return
- (transpose_k, reshape_k, split_k) = k_nodes
- if split_v != split_k:
- logger.debug("fuse_attention: skip since split_v != split_k")
- return
-
- if where_qk is None:
- return
-
- global num_heads, hidden_size
- if self.where_qk_shared is None:
- where_qk.input[1] = mask_nodes[0].output[0]
- div_qk.output[0] = where_qk.output[0]
- add_qk.input[1 - mask_return_indices[0]] = div_qk.output[0]
- self.where_qk_shared = where_qk
- self.nodes_to_remove.extend([softmax_qk, add_qk, div_qk, matmul_qk])
-
- num_heads, hidden_size = self.get_num_heads_and_hidden_size(
- custom_fc_after_attention, div_qk
- )
- self.nodes_to_remove.extend([k_nodes[0]])
- self.nodes_to_remove.extend(v_nodes[:-2])
- else:
- self.nodes_to_remove.extend(
- [softmax_qk, add_qk, where_qk, div_qk, matmul_qk]
- )
- self.nodes_to_remove.extend(q_nodes)
- self.nodes_to_remove.extend(k_nodes)
- self.nodes_to_remove.extend(v_nodes[:-1])
-
- new_node = self.create_attention_node(
- num_heads,
- hidden_size,
- custom_fc_before_attention.output[0],
- transpose_qkv.output[0] if reshape_2 is None else reshape_2.output[0],
- self.where_qk_shared,
- )
-
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
-
- if reshape_2 is not None:
- self.nodes_to_remove.extend([reshape_2])
- self.nodes_to_remove.extend([transpose_qkv, matmul_qkv])
-
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py
deleted file mode 100644
index d19c3aff604ed6f3ae673ffa0c67143b66e36aaf..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py
+++ /dev/null
@@ -1,511 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from logging import getLogger
-from typing import Dict
-
-import numpy as np
-from onnx import TensorProto, helper
-
-from .fusion_base import Fusion
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionLayerNormalization(Fusion):
- def __init__(self, model: OnnxModel, hidden_size):
- self.hidden_size = hidden_size
- super().__init__(model, "LayerNormalization", "ReduceMean")
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- Fuse Layer Normalization subgraph into one node LayerNormalization:
- +----------------------+
- | |
- | v
- [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
- (axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^
- | |
- +-----------------------------------------------+
-
- It also handles cases of duplicated sub nodes exported from older version of PyTorch:
- +----------------------+
- | v
- | +-------> Sub-----------------------------------------------+
- | | |
- | | v
- [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
- | ^
- | |
- +----------------------+
- """
- children = self.model.get_children(node, input_name_to_nodes)
- if len(children) == 0 or len(children) > 2:
- return
-
- root_input = node.input[0]
-
- if children[0].op_type != "Sub" or children[0].input[0] != root_input:
- return
-
- if len(children) == 2:
- if children[1].op_type != "Sub" or children[1].input[0] != root_input:
- return
-
- div_node = None
- for child in children:
- div_node = self.model.find_first_child_by_type(
- child, "Div", input_name_to_nodes, recursive=False
- )
- if div_node is not None:
- break
- if div_node is None:
- return
-
- path_id, parent_nodes, _ = self.model.match_parent_paths(
- div_node,
- [
- (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
- (
- ["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
- [1, 0, 0, 0, 0, 0],
- ),
- ],
- output_name_to_node,
- )
- if path_id < 0:
- return
-
- sub_node = parent_nodes[-1]
- if sub_node not in children:
- return
-
- second_add_node = parent_nodes[1]
- i, add_weight = self.model.get_constant_input(second_add_node)
- if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
- logger.warning(f"epsilon value is not expeced: {add_weight}")
- return
-
- pow_node = parent_nodes[3]
- if not self.model.find_constant_input(pow_node, 2.0) == 1:
- return
-
- mul_node = input_name_to_nodes[div_node.output[0]][0]
- is_not_have_mul_and_add = False
- is_not_have_mul_and_add_lst_node = None
- # deal with the special case where the layer norm has no trailing Mul and Add
- if mul_node.op_type == "MatMul":
- is_not_have_mul_and_add = True
- is_not_have_mul_and_add_lst_node = div_node
- elif mul_node.op_type != "Mul":
- return
-
- if is_not_have_mul_and_add:
- last_add_node = is_not_have_mul_and_add_lst_node
- if self.hidden_size == 0:
- print(
- "[Error] Please add '--hidden_size' and '--num_head' to fuse layernorm ..."
- )
- exit(1)
-
- subgraph_nodes = [node]
- subgraph_nodes.extend(children)
- subgraph_nodes.extend(parent_nodes[:-1])
- subgraph_nodes.extend([last_add_node])
- if len(subgraph_nodes) == 7:
- self.nodes_to_remove.extend(subgraph_nodes)
- else:
- return
-
- norm_name = self.model.create_node_name(
- "LayerNormalization", name_prefix="LayerNorm"
- )
- np_weights = np.ones((self.hidden_size)).astype(np.float32)
- np_weights_name = norm_name + "_weights"
- weights_tensor = helper.make_tensor(
- np_weights_name, TensorProto.FLOAT, np_weights.shape, np_weights
- )
- np_bias = np.zeros((self.hidden_size)).astype(np.float32)
- np_bias_name = norm_name + "_bias"
- bias_tensor = helper.make_tensor(
- np_bias_name, TensorProto.FLOAT, np_bias.shape, np_bias
- )
- self.model.add_initializer(weights_tensor)
- self.model.add_initializer(bias_tensor)
- normalize_node = helper.make_node(
- "LayerNormalization",
- inputs=[node.input[0], np_weights_name, np_bias_name],
- outputs=[last_add_node.output[0]],
- name=norm_name,
- )
- normalize_node.attribute.extend(
- [helper.make_attribute("epsilon", float(add_weight))]
- )
- self.nodes_to_add.append(normalize_node)
- self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
- else:
- last_add_node = input_name_to_nodes[mul_node.output[0]][0]
- if last_add_node.op_type != "Add":
- return
-
- subgraph_nodes = [node]
- subgraph_nodes.extend(children)
- subgraph_nodes.extend(parent_nodes[:-1])
-
- subgraph_nodes.extend([last_add_node, mul_node, div_node])
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- last_add_node.output,
- input_name_to_nodes,
- output_name_to_node,
- ):
- logger.debug(f"It is not safe to fuse LayerNormalization node. Skip")
- return
-
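-            # The Mul/Add each have one constant operand (the input not fed by the upstream node); those constants are the layernorm weight and bias.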
- weight_input = mul_node.input[
- 1 - self.model.input_index(div_node.output[0], mul_node)
- ]
- if not self.model.is_constant_with_specified_dimension(
- weight_input, 1, "layernorm weight"
- ):
- return
-
- bias_input = last_add_node.input[
- 1 - self.model.input_index(mul_node.output[0], last_add_node)
- ]
- if not self.model.is_constant_with_specified_dimension(
- bias_input, 1, "layernorm bias"
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- normalize_node = helper.make_node(
- "LayerNormalization",
- inputs=[node.input[0], weight_input, bias_input],
- outputs=[last_add_node.output[0]],
- name=self.model.create_node_name(
- "LayerNormalization", name_prefix="LayerNorm"
- ),
- )
- normalize_node.attribute.extend(
- [helper.make_attribute("epsilon", float(add_weight))]
- )
- self.nodes_to_add.append(normalize_node)
- self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
-
-
-class FusionLayerNormalizationKeras(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(
- model, "LayerNormalization", "GlobalAveragePool", "Keras layernorm"
- )
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- +-------------------------------+
- | |
- | v
- [Root] --> GlobalAveragePool--> Sub --> Mul --> GlobalAveragePool --> Add/Min/Max --> Sqrt --> Div --> Mul --> Add
- | ^
- | |
- +---------------------------------------------------------------+
- """
- children = self.model.get_children(node, input_name_to_nodes)
- # print(len(children))
- if len(children) != 1:
- return
-
- root_input = node.input[0]
-
- if children[0].op_type != "Sub" or children[0].input[0] != root_input:
- return
-
- div_node = None
- for child in children:
- div_node = self.model.find_first_child_by_type(
- child, "Div", input_name_to_nodes, recursive=False
- )
- if div_node is not None:
- break
- if div_node is None:
- return
- # print('div_node_name:', div_node.name)
- path_id, parent_nodes, _ = self.model.match_parent_paths(
- div_node,
- [
- (
- ["Sqrt", "Max", "Min", "Add", "GlobalAveragePool", "Mul", "Sub"],
- [1, 0, 0, 0, None, 0, None],
- ),
- ],
- output_name_to_node,
- )
- if path_id < 0:
- return
-
- sub_node = parent_nodes[-1]
- if sub_node not in children:
- return
-
- second_add_node = parent_nodes[3]
- i, add_weight = self.model.get_constant_input(second_add_node)
- if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
- logger.warning(f"epsilon value is not expeced: {add_weight}")
- return
-
- mul_node = input_name_to_nodes[div_node.output[0]][0]
- if mul_node.op_type != "Mul":
- return
-
- last_add_node = input_name_to_nodes[mul_node.output[0]][0]
- if last_add_node.op_type != "Add":
- return
-
- subgraph_nodes = [node]
- subgraph_nodes.extend(children)
- subgraph_nodes.extend(parent_nodes[:-1])
-
- subgraph_nodes.extend([last_add_node, mul_node, div_node])
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- last_add_node.output,
- input_name_to_nodes,
- output_name_to_node,
- ):
- logger.debug(f"It is not safe to fuse LayerNormalization node. Skip")
- return
-
- weight_input = mul_node.input[
- 1 - self.model.input_index(div_node.output[0], mul_node)
- ]
- if not self.model.is_constant_with_specified_dimension(
- weight_input, 1, "layernorm weight"
- ):
- return
-
- bias_input = last_add_node.input[
- 1 - self.model.input_index(mul_node.output[0], last_add_node)
- ]
- if not self.model.is_constant_with_specified_dimension(
- bias_input, 1, "layernorm bias"
- ):
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- normalize_node = helper.make_node(
- "LayerNormalization",
- inputs=[node.input[0], weight_input, bias_input],
- outputs=[last_add_node.output[0]],
- name=self.model.create_node_name(
- "LayerNormalization", name_prefix="LayerNorm"
- ),
- )
- normalize_node.attribute.extend(
- [helper.make_attribute("epsilon", float(add_weight))]
- )
- self.nodes_to_add.append(normalize_node)
- self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
-
-
-class FusionLayerNormalizationTF(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "LayerNormalization", "Add", "TF")
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- Layer Norm from Tensorflow model(using keras2onnx or tf2onnx):
- +------------------------------------+
- | |
- | |
- (Cast_1) |
- | |
- | v (B) (B) (A)
- Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocal --> Mul --> Mul --> Sub --> Add
- | | | ^ ^
- | | | | |
- | +--------------------------------------------------(Cast_2)-------------------------------|-------+ |
- | v |
- +---------------------------------------------------------------------------------------------------------------> Mul--------------------+
- """
- return_indice = []
- _, parent_nodes, return_indice = self.model.match_parent_paths(
- node,
- [
- (
- [
- "Sub",
- "Mul",
- "Mul",
- "Reciprocal",
- "Sqrt",
- "Add",
- "ReduceMean",
- "Mul",
- "Sub",
- "ReduceMean",
- ],
- [1, 1, None, 0, 0, 0, None, 0, 0, None],
- ),
- (
- [
- "Sub",
- "Mul",
- "Mul",
- "Reciprocal",
- "Sqrt",
- "Add",
- "Cast",
- "ReduceMean",
- "Mul",
- "Sub",
- "ReduceMean",
- ],
- [1, 1, None, 0, 0, 0, 0, None, 0, 0, None],
- ),
- ],
- output_name_to_node,
- ) # yapf: disable
-
- if parent_nodes is None:
- return
-
- assert len(return_indice) == 3
- if not (
- return_indice[0] in [0, 1]
- and return_indice[1] in [0, 1]
- and return_indice[2] in [0, 1]
- ):
- logger.debug(
- f"return_indice is expected to be in [0, 1], but got {return_indice}"
- )
- return
-
- (
- sub_node_0,
- mul_node_0,
- mul_node_1,
- reciprocol_node,
- sqrt_node,
- add_node_0,
- ) = parent_nodes[:6]
- reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[
- -4:
- ]
-
- cast_node_3 = None
- if len(parent_nodes) == 11:
- cast_node_3 = parent_nodes[6]
- assert cast_node_3.op_type == "Cast"
-
- mul_node_3 = self.model.match_parent(node, "Mul", 0, output_name_to_node)
- if mul_node_3 is None:
- logger.debug("mul_node_3 not found")
- return
-
- node_before_reduce = self.model.get_parent(
- reduce_mean_node_1, 0, output_name_to_node
- )
- root_node = (
- node_before_reduce
- if cast_node_3 is None
- else self.model.get_parent(node_before_reduce, 0, output_name_to_node)
- )
- if root_node is None:
- logger.debug("root node is none")
- return
-
- i, epsilon = self.model.get_constant_input(add_node_0)
- if (
- epsilon is None
- or epsilon <= 0
- or (epsilon > 1.0e-5 and cast_node_3 is None)
- ):
- logger.debug("epsilon is not matched")
- return
-
- if cast_node_3 is None and (
- reduce_mean_node_1.input[0] not in mul_node_3.input
- or reduce_mean_node_1.input[0] not in sub_node_1.input
- ):
- logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
- return
-
- if cast_node_3 is not None and (
- node_before_reduce.input[0] not in mul_node_3.input
- or reduce_mean_node_1.input[0] not in sub_node_1.input
- ):
- logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
- return
-
- if mul_node_2.input[0] != mul_node_2.input[1]:
- logger.debug("mul_node_2 shall have two same inputs")
- return
-
- subgraph_nodes = [
- node,
- sub_node_0,
- mul_node_0,
- mul_node_1,
- reciprocol_node,
- sqrt_node,
- add_node_0,
- reduce_mean_node_0,
- mul_node_2,
- sub_node_1,
- reduce_mean_node_1,
- mul_node_3,
- ]
-
- if cast_node_3 is not None:
- cast_node_2 = self.model.match_parent(
- mul_node_0, "Cast", 0, output_name_to_node
- )
- if cast_node_2 is None:
- logger.debug("cast_node_2 not found")
- return
- subgraph_nodes.extend([node_before_reduce, cast_node_2, cast_node_3])
-
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- node.output,
- self.model.input_name_to_nodes(),
- self.model.output_name_to_node(),
- ):
- logger.debug("not safe to fuse layer normalization")
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
-
- weight_input = mul_node_1.input[1]
- bias_input = sub_node_0.input[0]
-
- # TODO: add epsilon attribute
- fused_node = helper.make_node(
- "LayerNormalization",
- inputs=[mul_node_3.input[0], weight_input, bias_input],
- outputs=[node.output[0]],
- name=self.model.create_node_name(
- "LayerNormalization", name_prefix="LayerNorm"
- ),
- )
- fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))])
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py
deleted file mode 100644
index c0bb11b3bdd6bcbb994b8ad83501be2d9c1c4505..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from argparse import ArgumentParser
-
-
-class AttentionMaskFormat:
- MaskIndexEnd = 0
- MaskIndexEndAndStart = 1
- AttentionMask = 2
- NoMask = 3
-
-
-class FusionOptions:
- """Options of fusion in graph optimization"""
-
- def __init__(self, model_type):
- self.enable_gelu = True
- self.enable_layer_norm = True
- self.enable_attention = True
- self.enable_skip_layer_norm = True
- self.enable_embed_layer_norm = True
- self.enable_bias_skip_layer_norm = True
- self.enable_bias_gelu = True
- self.enable_gelu_approximation = False
- self.enable_qordered_matmul = True
-
- self.enable_shape_inference = True
- self.enable_swint_opt = False
- self.enable_format_roformer = False
- self.enable_gpt2_classify = False
- self.enable_vit = False
- self.enable_omdet = False
- self.attention_mask_format = AttentionMaskFormat.AttentionMask
-
- if model_type == "gpt2":
- self.enable_skip_layer_norm = False
- self.enable_gpt2_classify = True
- elif model_type == "swint":
- self.enable_swint_opt = True
- elif model_type == "roformer":
- self.enable_format_roformer = True
- elif model_type == "vit":
- self.enable_vit = True
- elif model_type == "omdet":
- self.enable_omdet = True
-
- def use_raw_attention_mask(self, use_raw_mask=True):
- if use_raw_mask:
- self.attention_mask_format = AttentionMaskFormat.AttentionMask
- else:
- self.attention_mask_format = AttentionMaskFormat.MaskIndexEnd
-
- def disable_attention_mask(self):
- self.attention_mask_format = AttentionMaskFormat.NoMask
-
- @staticmethod
- def parse(args):
- options = FusionOptions(args.model_type)
- if args.disable_gelu:
- options.enable_gelu = False
- if args.disable_layer_norm:
- options.enable_layer_norm = False
- if args.disable_attention:
- options.enable_attention = False
- if args.disable_skip_layer_norm:
- options.enable_skip_layer_norm = False
- if args.disable_embed_layer_norm:
- options.enable_embed_layer_norm = False
- if args.disable_bias_skip_layer_norm:
- options.enable_bias_skip_layer_norm = False
- if args.disable_bias_gelu:
- options.enable_bias_gelu = False
- if args.enable_gelu_approximation:
- options.enable_gelu_approximation = True
- if args.disable_shape_inference:
- options.enable_shape_inference = False
- if args.use_mask_index:
- options.use_raw_attention_mask(False)
- if args.no_attention_mask:
- options.disable_attention_mask()
- return options
-
- @staticmethod
- def add_arguments(parser: ArgumentParser):
- parser.add_argument(
- "--disable_attention",
- required=False,
- action="store_true",
- help="disable Attention fusion",
- )
- parser.set_defaults(disable_attention=False)
-
- parser.add_argument(
- "--disable_skip_layer_norm",
- required=False,
- action="store_true",
- help="disable SkipLayerNormalization fusion",
- )
- parser.set_defaults(disable_skip_layer_norm=False)
-
- parser.add_argument(
- "--disable_embed_layer_norm",
- required=False,
- action="store_true",
- help="disable EmbedLayerNormalization fusion",
- )
- parser.set_defaults(disable_embed_layer_norm=False)
-
- parser.add_argument(
- "--disable_bias_skip_layer_norm",
- required=False,
- action="store_true",
- help="disable Add Bias and SkipLayerNormalization fusion",
- )
- parser.set_defaults(disable_bias_skip_layer_norm=False)
-
- parser.add_argument(
- "--disable_bias_gelu",
- required=False,
- action="store_true",
- help="disable Add Bias and Gelu/FastGelu fusion",
- )
- parser.set_defaults(disable_bias_gelu=False)
-
- parser.add_argument(
- "--disable_layer_norm",
- required=False,
- action="store_true",
- help="disable LayerNormalization fusion",
- )
- parser.set_defaults(disable_layer_norm=False)
-
- parser.add_argument(
- "--disable_gelu",
- required=False,
- action="store_true",
- help="disable Gelu fusion",
- )
- parser.set_defaults(disable_gelu=False)
-
- parser.add_argument(
- "--enable_gelu_approximation",
- required=False,
- action="store_true",
- help="enable Gelu/BiasGelu to FastGelu conversion",
- )
- parser.set_defaults(enable_gelu_approximation=False)
-
- parser.add_argument(
- "--disable_shape_inference",
- required=False,
- action="store_true",
- help="disable symbolic shape inference",
- )
- parser.set_defaults(disable_shape_inference=False)
-
- parser.add_argument(
- "--use_mask_index",
- required=False,
- action="store_true",
- help="use mask index instead of raw attention mask in attention operator",
- )
- parser.set_defaults(use_mask_index=False)
-
- parser.add_argument(
- "--no_attention_mask",
- required=False,
- action="store_true",
- help="no attention mask. Only works for model_type=bert",
- )
- parser.set_defaults(no_attention_mask=False)
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py
deleted file mode 100644
index 9afa3edbc37f2ddd7b15c3eb976ee1cd9e72e356..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py
+++ /dev/null
@@ -1,527 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Tuple
-
-import numpy as np
-from onnx import NodeProto, helper
-
-from .fusion_attention import AttentionMask
-from .fusion_base import Fusion
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionQOrderedAttention(Fusion):
- def __init__(
- self,
- model: OnnxModel,
- hidden_size: int,
- num_heads: int,
- attention_mask: AttentionMask,
- ):
- self.hidden_size = hidden_size
- self.num_heads = num_heads
- self.attention_mask = attention_mask
- # Warning flags read by get_num_heads_and_hidden_size(); initialize them so the one-time warnings work
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- super().__init__(model, "QOrderedAttention", "QOrderedLayerNormalization")
-
- def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
- """Detect num_heads and hidden_size from a reshape node.
- Args:
- reshape_q (NodeProto): reshape node for Q
- Returns:
- Tuple[int, int]: num_heads and hidden_size
- """
-
- # We assume reshape fusion has already been done, so the shape is a tensor like [0, 0, num_heads, head_size]
- q_shape = self.model.get_initializer(reshape_q.input[1])
- if q_shape is None:
- logger.debug(f"{reshape_q.input[1]} is not initializer.")
-
- # Check if the second input to Reshape flows through a Constant node
- # TODO: Investigate why FusionAttention doesn't have such logic
- constant_node = self.model.match_parent_path(reshape_q, ["Constant"], [1])
-
- if constant_node is None:
- return (
- self.num_heads,
- self.hidden_size,
- ) # Fall back to user specified value
- else:
- constant_node = constant_node[0]
-
- if len(constant_node.attribute) != 1:
- return (
- self.num_heads,
- self.hidden_size,
- ) # Fall back to user specified value
-
- # This is assuming it is a Tensor attribute (this is a safe assumption)
- q_shape = constant_node.attribute[0].t
-
- q_shape_value = NumpyHelper.to_array(q_shape)
- if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
- logger.debug(
- f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]."
- )
- return self.num_heads, self.hidden_size # Fall back to user specified value
-
- num_heads = q_shape_value[2]
- head_size = q_shape_value[3]
- hidden_size = num_heads * head_size
-
- if self.num_heads > 0 and num_heads != self.num_heads:
- if self.num_heads_warning:
- logger.warning(
- f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value."
- )
- self.num_heads_warning = False # Do not show the warning more than once
-
- if self.hidden_size > 0 and hidden_size != self.hidden_size:
- if self.hidden_size_warning:
- logger.warning(
- f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
- )
- self.hidden_size_warning = (
- False # Do not show the warning more than once
- )
-
- return num_heads, hidden_size
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
- add_before_layernorm = self.model.match_parent_path(
- normalize_node,
- ["QuantizeLinear", "Add"],
- [0, 0],
- )
-
- if add_before_layernorm is not None:
- start_node = add_before_layernorm[-1]
- else:
- return
-
- # Input QDQ nodes
- dequantize_input = self.model.match_parent_path(
- start_node,
- ["DequantizeLinear"],
- [None],
- )
-
- if dequantize_input is None:
- logger.debug(
- "fuse_qordered_attention: failed to match input qdq nodes path"
- )
- return
-
- dequantize_input = dequantize_input[-1]
-
- # QKV nodes
- qkv_nodes = self.model.match_parent_path(
- start_node,
- [
- "Add",
- "MatMul",
- "Reshape",
- "Transpose",
- "DequantizeLinear",
- "QuantizeLinear",
- "MatMul",
- ],
- [None, None, 0, 0, 0, 0, 0],
- )
-
- if qkv_nodes is None:
- logger.debug("fuse_qordered_attention: failed to match qkv path")
- return
-
- (
- _,
- projection_matmul,
- reshape_qkv,
- transpose_qkv,
- dequantize_qkv,
- quantize_qkv,
- matmul_qkv,
- ) = qkv_nodes
-
- # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
- if not FusionUtils.check_qdq_node_for_fusion(quantize_qkv, self.model):
- return
-
- if not FusionUtils.check_qdq_node_for_fusion(dequantize_qkv, self.model):
- return
-
- # Identify the root input to the Attention node
- other_inputs = []
- for i, input in enumerate(start_node.input):
- if input not in output_name_to_node:
- continue
-
- if input == qkv_nodes[0].output[0]:
- continue
-
- other_inputs.append(input)
-
- if len(other_inputs) != 1:
- return
-
- root_input = other_inputs[0]
-
- # V nodes
- v_nodes = self.model.match_parent_path(
- matmul_qkv,
- [
- "Transpose",
- "Reshape",
- "DequantizeLinear",
- "QuantizeLinear",
- "Add",
- "MatMul",
- ],
- [1, 0, 0, 0, 0, None],
- )
-
- if v_nodes is None:
- logger.debug("fuse_qordered_attention: failed to match v path")
- return
-
- (_, _, dequantize_v, quantize_v, add_v, matmul_v) = v_nodes
-
- # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
- if not FusionUtils.check_qdq_node_for_fusion(quantize_v, self.model):
- return
-
- if not FusionUtils.check_qdq_node_for_fusion(dequantize_v, self.model):
- return
-
- # V MatMul weight
- dequantize_v_matmul_weight = self.model.match_parent_path(
- matmul_v, ["DequantizeLinear"], [1]
- )
-
- if dequantize_v_matmul_weight is None:
- logger.debug("fuse_qordered_attention: failed to match v path")
- return
-
- dequantize_v_matmul_weight = dequantize_v_matmul_weight[0]
-
- if self.model.get_constant_value(dequantize_v_matmul_weight.input[0]) is None:
- return
-
- # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
- # Per-channel scales are supported for weights alone
- if not FusionUtils.check_qdq_node_for_fusion(
- dequantize_v_matmul_weight, self.model, False
- ):
- return
-
- # QK nodes
- qk_nodes = self.model.match_parent_path(
- matmul_qkv,
- [
- "DequantizeLinear",
- "QuantizeLinear",
- "Softmax",
- "Add",
- "Div",
- "DequantizeLinear",
- "QuantizeLinear",
- "MatMul",
- ],
- [0, 0, 0, 0, None, 0, 0, 0],
- )
-
- if qk_nodes is None:
- logger.debug("fuse_qordered_attention: failed to match qk path")
- return
-
- (
- dequantize_qk_softmax,
- quantize_qk_softmax,
- softmax_qk,
- add_qk,
- div_qk,
- dequantize_qk,
- quantize_qk,
- matmul_qk,
- ) = qk_nodes
-
- # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
- if not FusionUtils.check_qdq_node_for_fusion(quantize_qk_softmax, self.model):
- return
-
- if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk_softmax, self.model):
- return
-
- if not FusionUtils.check_qdq_node_for_fusion(quantize_qk, self.model):
- return
-
- if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk, self.model):
- return
-
- # Q nodes
- q_nodes = self.model.match_parent_path(
- matmul_qk,
- [
- "Transpose",
- "Reshape",
- "DequantizeLinear",
- "QuantizeLinear",
- "Add",
- "MatMul",
- ],
- [0, 0, 0, 0, 0, None],
- )
-
- if q_nodes is None:
- logger.debug("fuse_qordered_attention: failed to match q path")
- return
-
- (_, reshape_q, dequantize_q, quantize_q, add_q, matmul_q) = q_nodes
-
- # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
- if not FusionUtils.check_qdq_node_for_fusion(quantize_q, self.model):
- return
-
- if not FusionUtils.check_qdq_node_for_fusion(dequantize_q, self.model):
- return
-
- # Q MatMul weight
- dequantize_q_matmul_weight = self.model.match_parent_path(
- matmul_q, ["DequantizeLinear"], [1]
- )
-
- if dequantize_q_matmul_weight is None:
- logger.debug("fuse_qordered_attention: failed to match q path")
- return
-
- dequantize_q_matmul_weight = dequantize_q_matmul_weight[0]
-
- if self.model.get_constant_value(dequantize_q_matmul_weight.input[0]) is None:
- return
-
- # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
- # Per-channel scales are supported for weights alone
- if not FusionUtils.check_qdq_node_for_fusion(
- dequantize_q_matmul_weight, self.model, False
- ):
- return
-
- # K nodes
- k_nodes = self.model.match_parent_path(
- matmul_qk,
- [
- "Transpose",
- "Reshape",
- "DequantizeLinear",
- "QuantizeLinear",
- "Add",
- "MatMul",
- ],
- [1, 0, 0, 0, 0, None],
- )
-
- if k_nodes is None:
- logger.debug("fuse_qordered_attention: failed to match k path")
- return
-
- (_, _, dequantize_k, quantize_k, add_k, matmul_k) = k_nodes
-
- # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
- if not FusionUtils.check_qdq_node_for_fusion(quantize_k, self.model):
- return
-
- if not FusionUtils.check_qdq_node_for_fusion(dequantize_k, self.model):
- return
-
- # K MatMul weight
- dequantize_k_matmul_weight = self.model.match_parent_path(
- matmul_k, ["DequantizeLinear"], [1]
- )
-
- if dequantize_k_matmul_weight is None:
- logger.debug("fuse_qordered_attention: failed to match k path")
- return
-
- dequantize_k_matmul_weight = dequantize_k_matmul_weight[0]
-
- if self.model.get_constant_value(dequantize_k_matmul_weight.input[0]) is None:
- return
-
- # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
- # Per-channel scales are supported for weights alone
- if not FusionUtils.check_qdq_node_for_fusion(
- dequantize_k_matmul_weight, self.model, False
- ):
- return
-
- # Mask nodes
- mask_nodes = self.model.match_parent_path(
- add_qk, ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0, 0]
- )
-
- if mask_nodes is None:
- logger.debug("fuse_qordered_attention: failed to match mask_nodes path")
- return
-
- # Ascertain `qkv_hidden_sizes` attribute value
- q_weight = self.model.get_initializer(dequantize_q_matmul_weight.input[0])
- k_weight = self.model.get_initializer(dequantize_k_matmul_weight.input[0])
- v_weight = self.model.get_initializer(dequantize_v_matmul_weight.input[0])
-
- qw = NumpyHelper.to_array(q_weight)
- kw = NumpyHelper.to_array(k_weight)
- vw = NumpyHelper.to_array(v_weight)
-
- qw_out_size = np.prod(qw.shape[1:])
- kw_out_size = np.prod(kw.shape[1:])
- vw_out_size = np.prod(vw.shape[1:])
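-        # Each projection's output width is the product of all weight dims except the first (input) dim; used for qkv_hidden_sizes.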
-
- # Form QOrderedAttention node
- if (
- matmul_v.input[0] == root_input
- and matmul_q.input[0] == root_input
- and matmul_k.input[0] == root_input
- ):
- mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0])
-
- # Ascertain `num_heads` and `hidden_size`
- num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
-
- # Formulate the inputs
- # Actual quantized input
- attention_inputs = [dequantize_input.input[0]]
- attention_inputs.append(dequantize_input.input[1])
-
- attention_inputs.append(dequantize_q.input[1])
- attention_inputs.append(dequantize_k.input[1])
- attention_inputs.append(dequantize_v.input[1])
-
- attention_inputs.append(dequantize_q_matmul_weight.input[0])
- attention_inputs.append(dequantize_k_matmul_weight.input[0])
- attention_inputs.append(dequantize_v_matmul_weight.input[0])
-
- attention_inputs.append(dequantize_q_matmul_weight.input[1])
- attention_inputs.append(dequantize_k_matmul_weight.input[1])
- attention_inputs.append(dequantize_v_matmul_weight.input[1])
-
- if self.model.get_initializer(add_q.input[0]):
- attention_inputs.append(add_q.input[0])
- else: # second input is the constant bias
- attention_inputs.append(add_q.input[1])
-
- if self.model.get_initializer(add_k.input[0]):
- attention_inputs.append(add_k.input[0])
- else: # second input is the constant bias
- attention_inputs.append(add_k.input[1])
-
- if self.model.get_initializer(add_v.input[0]):
- attention_inputs.append(add_v.input[0])
- else: # second input is the constant bias
- attention_inputs.append(add_v.input[1])
-
- attention_inputs.append(quantize_qk.input[1])
- attention_inputs.append(quantize_qk_softmax.input[1])
- attention_inputs.append(dequantize_qkv.input[1])
-
- # Mask input
- if mask_index is not None:
- attention_inputs.append(mask_index)
- else:
- attention_inputs.append("")
-
- # The MatMul weight 'B' and 'bias' need some post-processing
- # Transpose weight 'B' from order ROW to order COL
- # This offline transpose is needed only while using the CUDA EP
- # TODO: Make this fusion logic EP-agnostic ?
- q_weight_tensor = self.model.get_initializer(
- dequantize_q_matmul_weight.input[0]
- )
- FusionUtils.transpose_2d_int8_tensor(q_weight_tensor)
-
- k_weight_tensor = self.model.get_initializer(
- dequantize_k_matmul_weight.input[0]
- )
- FusionUtils.transpose_2d_int8_tensor(k_weight_tensor)
-
- v_weight_tensor = self.model.get_initializer(
- dequantize_v_matmul_weight.input[0]
- )
- FusionUtils.transpose_2d_int8_tensor(v_weight_tensor)
-
- # Name and create Attention node
- attention_node_name = self.model.create_node_name("QOrderedAttention")
-
- attention_node = helper.make_node(
- "QOrderedAttention",
- inputs=attention_inputs,
- outputs=[reshape_qkv.output[0]],
- name=attention_node_name,
- )
-
- self.model.replace_node_input(
- dequantize_qkv, dequantize_qkv.input[0], attention_node.output[0]
- )
- self.model.replace_node_input(
- projection_matmul, projection_matmul.input[0], dequantize_qkv.output[0]
- )
-
- attention_node.attribute.extend(
- [helper.make_attribute("num_heads", num_heads)]
- )
- attention_node.attribute.extend([helper.make_attribute("order_input", 1)])
- attention_node.attribute.extend([helper.make_attribute("order_weight", 0)])
- attention_node.attribute.extend([helper.make_attribute("order_output", 1)])
- attention_node.attribute.extend(
- [
- helper.make_attribute(
- "qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size]
- )
- ]
- )
-
- attention_node.domain = "com.microsoft"
-
- self.nodes_to_add.append(attention_node)
- self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
-
- self.nodes_to_remove.extend(
- [reshape_qkv, transpose_qkv, quantize_qkv, matmul_qkv]
- )
- self.nodes_to_remove.extend(qk_nodes)
- self.nodes_to_remove.extend(q_nodes)
- self.nodes_to_remove.extend(k_nodes)
- self.nodes_to_remove.extend(v_nodes)
- self.nodes_to_remove.extend(
- [
- dequantize_q_matmul_weight,
- dequantize_k_matmul_weight,
- dequantize_v_matmul_weight,
- ]
- )
-
- # Use prune graph to remove mask nodes since they are shared by all attention nodes.
- # self.nodes_to_remove.extend(mask_nodes)
- self.prune_graph = True
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py
deleted file mode 100644
index ebd165c4bc5da002eb53b2376c1e69facf40dec4..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Dict
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .fusion_utils import FusionUtils
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionQOrderedGelu(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "QOrderedGelu", ["Gelu", "FastGelu"])
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- INPUT PATTERN
- Fuse (quantized) Gelu subgraph into one node QOrderedGelu:
- -> quantized input -> DQ -> Gelu -> Q ->
-
- (or)
-
- -> quantized input -> DQ -> FastGelu -> Q ->
-
- OUTPUT PATTERN
- -> QOrderedGelu ->
- """
- gelu_children = self.model.get_children(node, input_name_to_nodes)
-
- # Should only have 1 child - QuantizeLinear (or)
- # Should have 2 children - QuantizeLinear + Shape
- if not (
- (len(gelu_children) == 1 and gelu_children[0].op_type == "QuantizeLinear")
- or (
- len(gelu_children) == 2
- and gelu_children[0].op_type == "QuantizeLinear"
- and gelu_children[1].op_type == "Shape"
- )
- ):
- return
-
- downstream_quantize_node = gelu_children[0]
- downstream_shape_node = None
-
- if len(gelu_children) == 2:
- downstream_shape_node = gelu_children[1]
-
- if not FusionUtils.check_qdq_node_for_fusion(
- downstream_quantize_node, self.model
- ):
- return
-
- # The first input to Gelu should flow through a DequantizeLinear node
- first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
- node,
- [(["DequantizeLinear"], [0])],
- output_name_to_node,
- )
-
- if first_path_id < 0:
- return
-
- upstream_dequantize_node = first_input_parent_nodes[0]
-
- if not FusionUtils.check_qdq_node_for_fusion(
- upstream_dequantize_node, self.model
- ):
- return
-
- # Fusion logic
- subgraph_nodes = [node] # Gelu/FastGelu
- subgraph_nodes.extend(
- [downstream_quantize_node, upstream_dequantize_node]
- ) # Relevant Q, DQ nodes
-
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- [node.output[0], downstream_quantize_node.output[0]]
- if downstream_shape_node is not None
- else downstream_quantize_node.output,
- input_name_to_nodes,
- output_name_to_node,
- ):
- logger.debug(f"It is not safe to fuse QOrderedGelu node. Skip")
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
-
- ordered_gelu_node = helper.make_node(
- "QOrderedGelu",
- inputs=[
- upstream_dequantize_node.input[0],
- upstream_dequantize_node.input[1],
- downstream_quantize_node.input[1],
- ],
- outputs=[downstream_quantize_node.output[0]],
- name=self.model.create_node_name(
- "QOrderedGelu", name_prefix="QOrderedGelu"
- ),
- )
-
- # Arrange the downstream Shape's input to be fed from the
- # downstream QuantizeLinear node, so that fusion will
- # be deemed safe
- if downstream_shape_node is not None:
- self.model.replace_node_input(
- downstream_shape_node,
- downstream_shape_node.input[0],
- downstream_quantize_node.output[0],
- )
-
- # TODO: We only support CuBlasLt order ORDER_ROW for now.
- # Once we start supporting other data ordering format(s), we
- # will support user configuring the data ordering for the op.
- ordered_gelu_node.attribute.extend([helper.make_attribute("order_X", 1)])
- ordered_gelu_node.attribute.extend([helper.make_attribute("order_Y", 1)])
-
- ordered_gelu_node.domain = "com.microsoft"
-
- self.nodes_to_add.append(ordered_gelu_node)
- self.node_name_to_graph_name[ordered_gelu_node.name] = self.this_graph_name
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py
deleted file mode 100644
index 94e38a0f5b549cb217359926172eb4aa510ad68b..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from logging import getLogger
-from typing import Dict
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .fusion_utils import FusionUtils
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionQOrderedLayerNormalization(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "QOrderedLayerNormalization", "LayerNormalization")
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- Fuse (quantized) Layer Normalization subgraph into one node QOrderedLayerNormalization:
- quantized input -> DQ
- |
- |
- (other inputs)-> LayerNormalization --> Q -->
-
- should become
-
- (quantized input + other inputs)-> QOrderedLayerNormalization --> Q -->
- """
-
- children = self.model.get_children(node, input_name_to_nodes)
-
- # Should only have 1 child - QuantizeLinear (or)
- # Should have 2 children - QuantizeLinear + Shape
- if not (
- (len(children) == 1 and children[0].op_type == "QuantizeLinear")
- or (
- len(children) == 2
- and children[0].op_type == "QuantizeLinear"
- and children[1].op_type == "Shape"
- )
- ):
- return
-
- downstream_quantize_node = children[0]
- downstream_shape_node = None
-
- if len(children) == 2:
- downstream_shape_node = children[1]
-
- if not FusionUtils.check_qdq_node_for_fusion(
- downstream_quantize_node, self.model
- ):
- return
-
- # The first input to LayerNormalization should flow through a DequantizeLinear node
- first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
- node,
- [(["DequantizeLinear"], [0])],
- output_name_to_node,
- )
-
- if first_path_id < 0:
- return
-
- upstream_dequantize_node = first_input_parent_nodes[0]
-
- if not FusionUtils.check_qdq_node_for_fusion(
- upstream_dequantize_node, self.model
- ):
- return
-
- # Fusion logic
- subgraph_nodes = [node] # LayerNormalization
- subgraph_nodes.extend(
- [downstream_quantize_node]
- ) # Q node after LayerNormalization
-
- upstream_dequantize_node_children = self.model.get_children(
- upstream_dequantize_node, input_name_to_nodes
- )
-
- # In GPT2, the DQ node will be feeding a residual downstream Add and hence,
- # we do not want to remove it
- if len(upstream_dequantize_node_children) == 1:
- subgraph_nodes.extend(
- [upstream_dequantize_node]
- ) # DQ node before LayerNormalization
-
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- [node.output[0], downstream_quantize_node.output[0]]
- if downstream_shape_node is not None
- else downstream_quantize_node.output,
- input_name_to_nodes,
- output_name_to_node,
- ):
- logger.debug(
- f"It is not safe to fuse QOrderedLayerNormalization node. Skip"
- )
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
-
- normalize_node = helper.make_node(
- "QOrderedLayerNormalization",
- inputs=[
- upstream_dequantize_node.input[0],
- upstream_dequantize_node.input[1],
- node.input[1],
- node.input[2],
- downstream_quantize_node.input[1],
- ],
- outputs=[downstream_quantize_node.output[0]],
- name=self.model.create_node_name(
- "QOrderedLayerNormalization", name_prefix="QOrderedLayerNormalization"
- ),
- )
-
- # Arrange the downstream Shape's input to be fed from the
- # downstream QuantizeLinear node, so that fusion will
- # be deemed safe
- if downstream_shape_node is not None:
- self.model.replace_node_input(
- downstream_shape_node,
- downstream_shape_node.input[0],
- downstream_quantize_node.output[0],
- )
-
- # TODO: We only support CuBlasLt order ORDER_ROW for now.
- # Once we start supporting other data ordering format(s), we
- # will support user configuring the data ordering for the op.
- normalize_node.attribute.extend([helper.make_attribute("order_X", 1)])
- normalize_node.attribute.extend([helper.make_attribute("order_Y", 1)])
-
- normalize_node.domain = "com.microsoft"
-
- self.nodes_to_add.append(normalize_node)
- self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py
deleted file mode 100644
index 8c8050e1cdfb0061b734b1224aa0006b1c09cdef..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Dict
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .fusion_utils import FusionUtils
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionQOrderedMatMul(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "QOrderedMatMul", "MatMul")
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- matmul_children = self.model.get_children(node, input_name_to_nodes)
-
- # Should only have 1 child - Bias Add
- if len(matmul_children) != 1 or matmul_children[0].op_type != "Add":
- return
-
- bias_add_node = matmul_children[0]
-
- # At least one of the inputs to the Bias Add node must be a constant
- bias_add_node_index = 0
- if (
- self.model.get_constant_value(bias_add_node.input[0]) is None
- and self.model.get_constant_value(bias_add_node.input[1]) is None
- ):
- return
-
- if self.model.get_constant_value(bias_add_node.input[0]) is None:
- bias_add_node_index = 1
-
- bias_add_children = self.model.get_children(bias_add_node, input_name_to_nodes)
-
- if len(bias_add_children) != 1:
- return
-
- bias_add_child = bias_add_children[0]
-
- # Bias Add can have another Add downstream (Residual Add layer)
- residual_add_node = None
-
- downstream_quantize_node = None
-
- if bias_add_child.op_type == "Add":
- residual_add_node = bias_add_child
-
- residual_add_children = self.model.get_children(
- residual_add_node, input_name_to_nodes
- )
-
- if (
- len(residual_add_children) != 1
- or residual_add_children[0].op_type != "QuantizeLinear"
- ):
- return
-
- downstream_quantize_node = residual_add_children[0]
-
- elif bias_add_child.op_type == "QuantizeLinear":
- downstream_quantize_node = bias_add_child
-
- else:
- return
-
- # Make sure the downstream QuantizeLinear has the proper zero points and scales
- if not FusionUtils.check_qdq_node_for_fusion(
- downstream_quantize_node, self.model
- ):
- return
-
- # The first input to MatMul should flow through a DequantizeLinear node
- first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
- node,
- [(["DequantizeLinear"], [0])],
- output_name_to_node,
- )
-
- # If Attention is not fused, this is the pattern to look for
- # leading upto the MatMul
- reshape_node_0 = None
- transpose_node_0 = None
- if first_path_id < 0:
- first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
- node,
- [
- (
- ["Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear"],
- [0, 0, 0, 0],
- )
- ],
- output_name_to_node,
- )
-
- if first_path_id < 0:
- return
-
- reshape_node_0 = first_input_parent_nodes[0]
- transpose_node_0 = first_input_parent_nodes[1]
- dequantize_node_0 = first_input_parent_nodes[2]
- else:
- dequantize_node_0 = first_input_parent_nodes[0]
-
- # Make sure the upstream DequantizeLinear-0 has the proper zero points and scales
- if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_0, self.model):
- return
-
- # The second input to MatMul should flow through a DequantizeLinear node
- dequantize_node_1 = None
- is_weight_transpose_required = True
-
- weight_path_id, weight_nodes, _ = self.model.match_parent_paths(
- node,
- [
- (
- [
- "DequantizeLinear",
- "QuantizeLinear",
- "Transpose",
- "DequantizeLinear",
- ],
- [1, 0, 0, 0],
- )
- ],
- output_name_to_node,
- )
-
- if weight_path_id < 0:
- weight_path_id, weight_nodes, _ = self.model.match_parent_paths(
- node,
- [(["DequantizeLinear"], [1])],
- output_name_to_node,
- )
-
- if weight_path_id < 0:
- return
-
- dequantize_node_1 = weight_nodes[0]
- else:
- is_weight_transpose_required = False
- dequantize_node_1 = weight_nodes[3]
-
- # Check if weight 'B' is a constant
- if self.model.get_constant_value(dequantize_node_1.input[0]) is None:
- return
-
- # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
- # Per-channel scales are supported for weights alone
- if not FusionUtils.check_qdq_node_for_fusion(
- dequantize_node_1, self.model, False
- ):
- return
-
- # Make sure the upstream flow into the Residual Add node flows through a DQ node
- residual_add_dequantize_node = None
-
- if residual_add_node is not None:
- (
- residual_path_id,
- residual_input_parent_nodes,
- _,
- ) = self.model.match_parent_paths(
- residual_add_node,
- [
- (["DequantizeLinear"], [1]),
- ],
- output_name_to_node,
- )
-
- if residual_path_id < 0:
- return
-
- residual_add_dequantize_node = residual_input_parent_nodes[0]
-
- # Make sure the upstream DequantizeLinear to the Residual Add has the proper zero points and scales
- if (
- residual_add_dequantize_node is not None
- and not FusionUtils.check_qdq_node_for_fusion(
- residual_add_dequantize_node, self.model
- )
- ):
- return
-
- # Subgraph nodes to be fused
- subgraph_nodes = [node, bias_add_node] # MatMul + Bias Add
-
- if residual_add_node is not None:
- subgraph_nodes.extend([residual_add_node]) # Residual Add
-
- subgraph_nodes.extend(weight_nodes)
- subgraph_nodes.extend([downstream_quantize_node]) # Downstream Q node
-
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes,
- downstream_quantize_node.output,
- input_name_to_nodes,
- output_name_to_node,
- ):
- logger.debug(f"It is not safe to fuse QOrderedMatMul node. Skip")
- return
-
- # Deal with the case where-in the Attention subgraph is not fused
- if transpose_node_0 is not None:
- self.model.replace_node_input(
- transpose_node_0, transpose_node_0.input[0], dequantize_node_0.input[0]
- )
-
- # Make inputs
- fused_node_inputs = [
- reshape_node_0.output[0]
- if reshape_node_0 is not None
- else dequantize_node_0.input[0],
- dequantize_node_0.input[1],
- dequantize_node_1.input[0],
- dequantize_node_1.input[1],
- downstream_quantize_node.input[1],
- bias_add_node.input[bias_add_node_index],
- ]
-
- if residual_add_node is not None:
- fused_node_inputs.append(residual_add_dequantize_node.input[0])
- fused_node_inputs.append(residual_add_dequantize_node.input[1])
-
- # The MatMul weight 'B' and 'bias' need some post-processing
- # Transpose weight 'B' from order ROW to order COL
- # This offline transpose is needed only while using the CUDA EP
- # TODO: Make this fusion logic EP-agnostic ?
- if is_weight_transpose_required:
- weight_tensor = self.model.get_initializer(dequantize_node_1.input[0])
- FusionUtils.transpose_2d_int8_tensor(weight_tensor)
-
- fused_node = helper.make_node(
- "QOrderedMatMul",
- inputs=fused_node_inputs,
- outputs=[downstream_quantize_node.output[0]],
- name=self.model.create_node_name(
- "QOrderedMatMul", name_prefix="QOrderedMatMul"
- ),
- )
-
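-        # CuBlasLt data orderings: here 1 denotes ORDER_ROW (activations/output) and 0 denotes ORDER_COL (the pre-transposed weight B).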
- fused_node.attribute.extend([helper.make_attribute("order_A", 1)])
- fused_node.attribute.extend([helper.make_attribute("order_B", 0)])
- fused_node.attribute.extend([helper.make_attribute("order_Y", 1)])
-
- fused_node.domain = "com.microsoft"
-
- self.nodes_to_remove.extend(subgraph_nodes)
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py
deleted file mode 100644
index 2a5bf73fdf07f223be18e7bbaf20f9623ebb3fdc..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-
-import numpy as np
-from onnx import TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionReshape(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "Reshape", "Reshape")
- self.prune_graph: bool = False
-
- def replace_reshape_node(self, shape, reshape_node, concat_node):
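-        # In an ONNX Reshape target shape, 0 keeps the corresponding input dim and -1 is inferred, so the dynamic Concat can be folded into this constant.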
- shape_value = np.asarray([int(x) if isinstance(x, np.ndarray) else x for x in shape], dtype=np.int64)
- constant_shape_name = self.model.create_node_name("Constant", "constant_shape")
- new_node = helper.make_node(
- "Constant",
- inputs=[],
- outputs=[constant_shape_name],
- value=helper.make_tensor(
- name="const_tensor",
- data_type=TensorProto.INT64,
- dims=shape_value.shape,
- vals=bytes(shape_value),
- raw=True,
- ),
- )
- reshape_node.input[1] = constant_shape_name
- reshape_node.name = self.model.create_node_name("Reshape", "Reshape_Fuse")
- self.nodes_to_remove.extend([concat_node])
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
-
- def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node):
- if reshape_node.input[1] not in output_name_to_node:
- return
-
- concat_node = output_name_to_node[reshape_node.input[1]]
- if (
- concat_node.op_type != "Concat"
- or len(concat_node.input) < 3
- or len(concat_node.input) > 4
- ):
- return
-
- path0 = self.model.match_parent_path(
- concat_node,
- ["Unsqueeze", "Gather", "Shape"],
- [0, 0, 0],
- output_name_to_node,
- )
- if path0 is None:
- return
-
- (unsqueeze_0, gather_0, shape_0) = path0
-
- path1 = self.model.match_parent_path(
- concat_node,
- ["Unsqueeze", "Gather", "Shape"],
- [1, 0, 0],
- output_name_to_node,
- )
- if path1 is None:
- return
- (unsqueeze_1, gather_1, shape_1) = path1
-
- shape = []
- gather_value = self.model.get_constant_value(gather_0.input[1])
- if gather_value == 0:
- shape.append(0)
-
- gather_value = self.model.get_constant_value(gather_1.input[1])
- if gather_value == 1:
- shape.append(0)
-
- if len(shape) != 2:
- return
-
- path2 = []
- path3 = []
- shape_nodes = [shape_0, shape_1]
- if (
- len(concat_node.input) == 3
- and self.model.get_initializer(concat_node.input[2]) is None
- ):
- path2 = self.model.match_parent_path(
- concat_node,
- ["Unsqueeze", "Mul", "Gather", "Shape"],
- [2, 0, 0, 0],
- output_name_to_node,
- )
- if path2 is None:
- path2 = self.model.match_parent_path(
- concat_node,
- ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"],
- [2, 0, 0, 0, 0],
- output_name_to_node,
- ) # GPT2 exported by PyTorch 1.4 with opset_version=11
- if path2 is None:
- return
-
- path3 = self.model.match_parent_path(
- concat_node,
- ["Unsqueeze", "Mul", "Gather", "Shape"],
- [2, 0, 1, 0],
- output_name_to_node,
- )
- if path3 is None:
- path3 = self.model.match_parent_path(
- concat_node,
- ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"],
- [2, 0, 1, 0, 0],
- output_name_to_node,
- ) # GPT2 exported by PyTorch 1.4 with opset_version=11
- if path3 is None:
- return
-
- shape_nodes.extend([path2[-1], path3[-1]])
- shape.append(-1)
- elif len(concat_node.input) > 2:
- concat_2 = self.model.get_initializer(concat_node.input[2])
- if concat_2 is None:
- return
- concat_value = numpy_helper.to_array(concat_2)
- if isinstance(concat_value, list):
- shape.extend(concat_value)
- else:
- shape.append(concat_value)
-
- if (
- len(concat_node.input) == 4
- and self.model.get_initializer(concat_node.input[3]) is None
- ):
- if -1 in shape:
- return
-
- path2 = self.model.match_parent_path(
- concat_node,
- ["Unsqueeze", "Div", "Gather", "Shape"],
- [3, 0, 0, 0],
- output_name_to_node,
- )
- if path2 is None:
- path2 = self.model.match_parent_path(
- concat_node,
- ["Unsqueeze", "Div", "Squeeze", "Slice", "Shape"],
- [3, 0, 0, 0, 0],
- output_name_to_node,
- ) # GPT2 exported by PyTorch 1.4 with opset_version=11
- if path2 is None:
- return
- shape_nodes.extend([path2[-1]])
- shape.append(-1)
- elif len(concat_node.input) > 3:
- concat_3 = self.model.get_initializer(concat_node.input[3])
- if concat_3 is None:
- return
-
- concat_value = numpy_helper.to_array(concat_3)
- if isinstance(concat_value, list):
- shape.extend(concat_value)
- else:
- shape.append(concat_value)
-
- root_input = reshape_node.input[0]
- same_shape_input = True
- for shape_node in shape_nodes:
- if shape_node.input[0] != root_input:
- same_shape_input = False
-
- if not same_shape_input:
- return
-
- self.replace_reshape_node(shape, reshape_node, concat_node)
-
- # TODO(tlwu): Subgraph blocks pruning un-used nodes. Add code to remove un-used nodes safely.
- self.prune_graph = True
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py
deleted file mode 100644
index b3ec51a5a25af26a36ef9fc0015b80104e4cd67f..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-import logging
-from typing import Dict
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = logging.getLogger(__name__)
-
-
-class FusionRMSNorm(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "RMSNorm", "Mul")
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- if node.op_type != "Mul":
- return
-
- sim_ln_nodes = None
- # SimplifiedLayerNorm calculation (notation from https://onnx.ai/onnx/operators/onnx__LayerNormalization.html#summary):
- # DD = Pow(D, 2)
- # Var = ReduceMean(DD)
- # VarEps = Add(Var, epsilon)
- # StdDev = Sqrt(VarEps)
- # InvStdDev = Div(1, StdDev)
- # Normalized = Mul(D, InvStdDev)
- # NormalizedScaled = Mul(Normalized, Scale)
-
- # RMSNorm
- # +-------------------------------------------------------+
- # | |
- # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul
- # |
- # node
- sim_ln_nodes_1 = self.model.match_parent_path(
- node,
- ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"],
- [1, 1, 1, 0, 0, 0, 0],
- )
- # RMSNorm
- # +-------------------------------------------------------+
- # | |
- # Gather --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul
- # |
- # node
- sim_ln_nodes_2 = self.model.match_parent_path(
- node,
- ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Gather"],
- [1, 1, 1, 0, 0, 0, 0],
- )
-
- # For LLaMA from Microsoft custom export:
- # sim_ln_nodes_3 uses a different start parent index than sim_ln_nodes_1
- #
- # RMSNorm
- # +-------------------------------------------------------+
- # | |
- # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul
- # |
- # node
- sim_ln_nodes_3 = self.model.match_parent_path(
- node,
- ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"],
- [0, 1, 1, 0, 0, 0, 0],
- )
-
- # sim_ln_nodes_4 starts with a graph input instead of an Add node like sim_ln_nodes_3
- #
- # RMSNorm
- # +-----------------------------------------------+
- # | |
- # graph_input --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul
- # |
- # node
- sim_ln_nodes_4 = self.model.match_parent_path(
- node,
- ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow"],
- [0, 1, 1, 0, 0, 0],
- )
-
- add_node, pow_node = None, None
- if sim_ln_nodes_1 is not None:
- sim_ln_nodes = sim_ln_nodes_1
- add_node = sim_ln_nodes[3]
- pow_node = sim_ln_nodes[-2]
- elif sim_ln_nodes_2 is not None:
- sim_ln_nodes = sim_ln_nodes_2
- add_node = sim_ln_nodes[3]
- pow_node = sim_ln_nodes[-2]
- elif sim_ln_nodes_3 is not None:
- sim_ln_nodes = sim_ln_nodes_3
- add_node = sim_ln_nodes[3]
- pow_node = sim_ln_nodes[-2]
- elif sim_ln_nodes_4 is not None:
- sim_ln_nodes = sim_ln_nodes_4
- add_node = sim_ln_nodes[3]
- pow_node = sim_ln_nodes[-1]
- # Verify that parent input to Pow node is graph_input
- if pow_node.input[0] not in self.model.get_graphs_input_names():
- return
- else:
- return
-
- layernorm_weight_index = (
- 1 if sim_ln_nodes in (sim_ln_nodes_3, sim_ln_nodes_4) else 0
- )
- starts_with_graph_input = sim_ln_nodes == sim_ln_nodes_4
-
- if self.model.find_constant_input(pow_node, 2.0) != 1:
- return
-
- root_input = pow_node.input[0]
- if root_input != sim_ln_nodes[0].input[0]:
- return
-
- i, add_weight = self.model.get_constant_input(add_node)
- if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
- logger.warning(f"epsilon value is not expected: {add_weight}")
- return
-
- self.nodes_to_remove.extend(
- sim_ln_nodes[:-1] if not starts_with_graph_input else sim_ln_nodes
- )
- self.nodes_to_remove.append(node)
-
- normalize_node = helper.make_node(
- "RMSNormPluginDynamic_IxRT",
- inputs=[root_input, node.input[layernorm_weight_index]],
- outputs=[node.output[0]],
- name=self.model.create_node_name(
- "RMSNormPluginDynamic_IxRT", name_prefix="RMSNorm_"
- ),
- )
-
- normalize_node.domain = "com.iluvatar"
- normalize_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- normalize_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- normalize_node.attribute.extend(
- [helper.make_attribute("epsilon", float(add_weight))]
- )
- normalize_node.attribute.extend([helper.make_attribute("axis", -1)])
- normalize_node.attribute.extend([helper.make_attribute("stash_type", 1)])
- gamma_data = self.model.get_initializer(normalize_node.input[1])
- gamma_data_np = NumpyHelper.to_array(gamma_data)
- normalize_node.attribute.extend(
- [helper.make_attribute("hidden_size", int(gamma_data_np.shape[0]))]
- )
-
- normalize_node.attribute.extend([helper.make_attribute("gamma", gamma_data)])
-
- self.nodes_to_add.append(normalize_node)
- self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
- return True
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py
deleted file mode 100644
index 1d99595e8e8d9dc1cde4da1c66f266251d0919ca..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py
+++ /dev/null
@@ -1,371 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class FusionRoformerCrossAttention(Fusion):
- """
-    Fuse the RoFormer cross-attention subgraph into one Attention node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(
- model,
- "CustomQkvCrossToContext_IxRT",
- ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],
- )
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def get_num_heads_and_hidden_size(
- self, custom_fc: NodeProto, mul: NodeProto
- ) -> Tuple[int, int]:
- mul_initializer = self.model.get_initializer(mul.input[1])
-
-        # Check whether float_data is empty
- if len(mul_initializer.float_data) > 0:
- mul_value = mul_initializer.float_data[0]
- else:
-            # If float_data is empty, try another way to get the data,
-            # e.g. when the data is stored in raw_data
- if len(mul_initializer.raw_data) > 0:
- dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type]
- mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0]
- else:
- raise ValueError("Data not found in the mul_initializer")
-
- for attr in custom_fc.attribute:
- if attr.name == "W":
- tensor_value = attr.t
- tensor_shape = [dim for dim in tensor_value.dims]
- break
- head_dim = math.floor(1.0 / (mul_value * mul_value))
- hidden_size = tensor_shape[0]
- num_heads = hidden_size // head_dim
-
- return num_heads, hidden_size
-
- def create_attention_node(
- self,
- num_heads: int,
- hidden_size: int,
- input_q: str,
- input_k: str,
- input_v: str,
- input_mask: str,
- output: str,
- matmul_qk_add: NodeProto,
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
- hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
- input_q: str,
- input_k: str,
- input_v: str,
- input_mask: str,
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- assert num_heads > 0
-
- if hidden_size > 0 and (hidden_size % num_heads) != 0:
- logger.debug(
- f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
- )
- return None
-
- attention_node_name = self.model.create_node_name("CrossAttention")
-
- attention_inputs = [input_q, input_k, input_v, input_mask]
-
- attention_node = helper.make_node(
- "CustomQkvCrossToContext_IxRT",
- inputs=attention_inputs,
- outputs=[output],
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("has_mask", 1)])
- attention_node.attribute.extend([helper.make_attribute("type_mask", 4)]) #3:float mask 4:int32 mask
- attention_node.attribute.extend([helper.make_attribute("scale", 1.0 / 8)]) #1 /sqrt(num_heads)
-
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
-
- return attention_node
-
- def get_shape(self, edge_name):
- for info in self.model.graph().value_info:
- if info.name == edge_name:
- return info.type.tensor_type.shape.dim
- return None
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
-        # Sometimes we cannot fuse SkipLayerNormalization since the Add before LayerNorm has an output that is used by nodes outside the SkipLayerNorm
-        # Conceptually we treat the Add before LayerNorm as a SkipLayerNorm node since they share the same pattern
- start_node = normalize_node
-
- # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
- qkv_paths = {
- "path1": (
- [
- "CustomFCPluginDynamic_IxRT",
- "Reshape",
- "Transpose",
- "Reshape",
- "MatMul",
- ],
- [0, 0, 0, 0, 0],
- ),
- "path2": (
- [
- "CustomFCPluginDynamic_IxRT",
- "Reshape",
- "Transpose",
- "Reshape",
- "MatMul",
- ],
- [1, 0, 0, 0, 0],
- ),
- }
- # print('start_nodes:', start_node.name)
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
-
- if qkv_nodes is None:
- logger.debug("fuse_attention: failed to match qkv path")
- return
-
- fc_after_atten = None
- if qkv_path in ["path1", "path2"]:
- (
- fc_after_atten,
- reshape_qkv_2,
- transpose_qkv,
- reshape_qkv_1,
- matmul_qkv,
- ) = qkv_nodes
-
- """
- Match
- Add --> LayerNormalization --> Attention --> Add --> LayerNormalization
- | |
- | |
- +---------------------------------------------------------
- """
- add_before_layernorm = self.model.match_parent(start_node, "Add", None)
- if add_before_layernorm is not None:
- node_children = input_name_to_nodes[add_before_layernorm.output[0]]
- for child in node_children:
- if child is not None and child.op_type == "LayerNormalization":
- root_input = child.output[0]
-
- v_paths = {"path1": (["Reshape", "Transpose", "Reshape"], [1, 0, 0])}
-
- v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
- if v_path == "path1":
- (reshape_v, transpose_v, v_reshape) = v_nodes
-
- if v_nodes is None:
- logger.debug("fuse_attention: failed to match v path")
- return
-
- qk_paths = {
- "path1": (
- ["Softmax", "Add", "Mul", "Mul", "Reshape", "MatMul"],
- [0, 0, None, None, None, 0],
- )
- }
-
- qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths)
-
- if qk_nodes is None:
- logger.debug("fuse_attention: failed to match qk path")
- return
- # print('qk_nodes', qk_nodes[0].name)
- matmul_qk_add = None
- if qk_path == "path1":
- (_, add_mask, mul_mask, mul_qk, reshape_qk, matmul_qk) = qk_nodes
-
- q_paths = {
- "path1": (["Transpose", "Add"], [0, 0]),
- }
- q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths)
- if q_nodes is None:
- logger.debug("fuse_attention: failed to match q path")
- return
- # print('q_nodes', q_nodes[0].name)
- if q_path == "path1":
- (q_tranpose, q_add) = q_nodes
-
- k_paths = {
- "path1": (["Reshape", "Transpose", "Add"], [1, 0, 0]),
- }
- k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths)
-
- if k_nodes is None:
- logger.debug("fuse_attention: failed to match k path")
- return
- # print('k_nodes', k_nodes[0].name)
- if k_path == "path1":
- (_, k_transpose, k_add) = k_nodes
- # print('add_mask', add_mask.name)
- mask_paths = {
- "path1": (
- ["Mul", "Sub", "Unsqueeze", "Cast", "Greater"],
- [1, None, 1, 0, 0],
- )
- }
- mask_nodes, mask_path = self.match_parent_path_from_dict(add_mask, mask_paths)
-
- if mask_nodes is None:
- logger.debug("fuse_attention: failed to match mask path")
- return
- # print('mask_nodes', mask_nodes[0].name)
- (_, mask_sub, mask_unsqueeze, mask_cast, mask_greater) = mask_nodes
-
- if (
- self.get_shape(q_add.output[0]) == self.get_shape(k_add.output[0])
- and self.get_shape(k_add.output[0]) == self.get_shape(v_reshape.output[0])
- and mul_mask.input[1] in mask_unsqueeze.output
- ):
- attention_last_node = reshape_qkv_1
-
- num_heads, hidden_size = self.get_num_heads_and_hidden_size(
- fc_after_atten, mul_qk
- )
-
- q_transpose_type = None
- q_transpose_name = None
- for info in self.model.graph().value_info:
- if info.name == q_tranpose.output[0]:
- q_transpose_type = info.type
- q_transpose_name = info.name
- break
-
- q_transpose_output = helper.make_value_info(
- q_transpose_name[:-2] + "_fake_q", q_transpose_type
- )
- q_transpose_node = helper.make_node(
- "Transpose",
- inputs=[q_add.output[0]],
- outputs=[q_transpose_output.name],
- name=q_transpose_output.name,
- )
- q_transpose_node.attribute.extend(
- [helper.make_attribute("perm", [0, 2, 1, 3])]
- )
-
- k_transpose_output = helper.make_value_info(
- q_transpose_name[:-2] + "_fake_k", q_transpose_type
- )
- k_transpose_node = helper.make_node(
- "Transpose",
- inputs=[k_add.output[0]],
- outputs=[k_transpose_output.name],
- name=k_transpose_output.name,
- )
- k_transpose_node.attribute.extend(
- [helper.make_attribute("perm", [0, 2, 1, 3])]
- )
-
- v_transpose_output = helper.make_value_info(
- q_transpose_name[:-2] + "_fake_v", q_transpose_type
- )
- v_transpose_node = helper.make_node(
- "Transpose",
- inputs=[v_reshape.output[0]],
- outputs=[v_transpose_output.name],
- name=v_transpose_output.name,
- )
- v_transpose_node.attribute.extend(
- [helper.make_attribute("perm", [0, 2, 1, 3])]
- )
-
- mask_type = None
- for info in self.model.graph().value_info:
- if info.name == mask_sub.output[0]:
- mask_type = info.type
- break
-
- new_mask_type = onnx.TypeProto()
- new_mask_type.tensor_type.elem_type = onnx.TensorProto.INT32
- for dim in mask_type.tensor_type.shape.dim:
- new_dim = new_mask_type.tensor_type.shape.dim.add()
- new_dim.CopyFrom(dim)
-
- mask_cast_to_int32_output = helper.make_value_info(
- mask_sub.name + "_cast_to_int32", new_mask_type
- )
- mask_cast_to_int32_node = helper.make_node(
- "Cast",
- inputs=[mask_sub.output[0]],
- outputs=[mask_cast_to_int32_output.name],
- name=mask_cast_to_int32_output.name,
- )
- mask_cast_to_int32_node.attribute.extend([helper.make_attribute("to", 6)])
-
- new_node = self.create_attention_node(
- num_heads,
- hidden_size,
- q_transpose_node.output[0],
- k_transpose_node.output[0],
- v_transpose_node.output[0],
- mask_cast_to_int32_node.output[0],
- attention_last_node.output[0],
- matmul_qk_add,
- )
- if new_node is None:
- return
-
- self.nodes_to_add.extend(
- [
- q_transpose_node,
- k_transpose_node,
- v_transpose_node,
- new_node,
- mask_cast_to_int32_node,
- ]
- )
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
- self.node_name_to_graph_name[q_transpose_node.name] = self.this_graph_name
- self.node_name_to_graph_name[k_transpose_node.name] = self.this_graph_name
- self.node_name_to_graph_name[v_transpose_node.name] = self.this_graph_name
- self.node_name_to_graph_name[
- mask_cast_to_int32_node.name
- ] = self.this_graph_name
-
- self.nodes_to_remove.extend(qkv_nodes[3:])
- self.nodes_to_remove.extend(qk_nodes)
- self.nodes_to_remove.extend(q_nodes[:-1])
- self.nodes_to_remove.extend(k_nodes[:-1])
- self.nodes_to_remove.extend(v_nodes[:-1])
- self.nodes_to_remove.extend([mask_nodes[0]])
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py
deleted file mode 100644
index dfa14d0e25951f7ce72c719c452ebb56232e14a7..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionRoPE(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "CustomRoPEPluginDynamic_IxRT", "Add")
-
- def fuse(self, start_node, input_name_to_nodes, output_name_to_node):
- src_paths = {"path1": (["Mul", "Concat", "Split", "Slice"], [0, 1, None, 0])}
- src_nodes, src_path = self.match_parent_path_from_dict(start_node, src_paths)
- if src_nodes is None:
- logger.debug("fuse_rope: failed to match src_node")
- return
-
- src_node = src_nodes[0]
-
- rotate_paths = {"path1": (["Mul", "Reshape", "Concat"], [1, 0, 0])}
- rotate_nodes, rotate_path = self.match_parent_path_from_dict(
- start_node, rotate_paths
- )
-
- if rotate_nodes is None:
- logger.debug("fuse_rope: failed to match rotate_path")
- return
-
- concat_node = rotate_nodes[-1]
- mul_right_node = rotate_nodes[0]
-
- odd_paths = {"path1": (["Unsqueeze", "Neg", "Slice", "Reshape"], [0, 0, 0, 0])}
- odd_nodes, odd_path = self.match_parent_path_from_dict(concat_node, odd_paths)
-
- if odd_nodes is None:
- logger.debug("fuse_rope: failed to match odd_path")
- return
-
- even_paths = {"path1": (["Unsqueeze", "Slice", "Reshape"], [1, 0, 0])}
- even_nodes, even_path = self.match_parent_path_from_dict(
- concat_node, even_paths
- )
-
- if even_nodes is None:
- logger.debug("fuse_rope: failed to match even_path")
- return
- reshape_node = even_nodes[-1]
-
- if reshape_node.output[0] == src_node.input[0]:
- rope_node_name = self.model.create_node_name("RoPE")
- rope_node = helper.make_node(
- "CustomRoPEPluginDynamic_IxRT",
- inputs=[
- reshape_node.output[0],
- src_nodes[0].input[1],
- mul_right_node.input[1],
- ],
- outputs=[start_node.output[0]],
- name=rope_node_name,
- )
- rope_node.domain = "com.iluvatar"
- rope_node.attribute.extend([helper.make_attribute("type_id", 2)])
- rope_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- rope_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
-
- self.nodes_to_add.append(rope_node)
- self.node_name_to_graph_name[rope_node.name] = self.this_graph_name
-
- self.nodes_to_remove.extend([start_node])
- self.nodes_to_remove.extend([src_nodes[0]])
- self.nodes_to_remove.extend(rotate_nodes)
- self.nodes_to_remove.extend(odd_nodes[:-1])
- self.nodes_to_remove.extend(even_nodes[:-1])
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py
deleted file mode 100644
index 727d4b82d44805f6d52c8e7fd72d94acf846e73e..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Dict, List, Union
-
-from onnx import NodeProto, TensorProto
-
-from .fusion_base import Fusion
-from .fusion_utils import FusionUtils
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionShape(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(model, "Shape", "Concat")
- self.utils = FusionUtils(model)
- self.shape_infer = None
- self.shape_infer_done = False
-
- def get_dimensions_from_tensor_proto(
- self, tensor_proto: TensorProto
- ) -> Union[int, None]:
- if tensor_proto.type.tensor_type.HasField("shape"):
- return len(tensor_proto.type.tensor_type.shape.dim)
- else:
- return None
-
- def get_dimensions(self, input_name: str) -> Union[int, None]:
- graph_input = self.model.find_graph_input(input_name)
- if graph_input:
- return self.get_dimensions_from_tensor_proto(graph_input)
-
- if not self.shape_infer_done:
- self.shape_infer = self.model.infer_runtime_shape({}, update=True)
- self.shape_infer_done = True
-
- if self.shape_infer is not None:
- return self.get_dimensions_from_tensor_proto(
- self.shape_infer.known_vi_[input_name]
- )
-
- return None
-
- def fuse(
- self,
- concat_node: NodeProto,
- input_name_to_nodes: Dict[str, List[NodeProto]],
- output_name_to_node: Dict[str, NodeProto],
- ):
- """
-        Simplify a subgraph like
-
-                   (2d_input)
-                    /       \
-                Shape       Shape
- / \
- Gather(indices=0) Gather(indices=1)
- | |
- Unsqueeze(axes=0) Unsqueeze(axes=0)
- \ /
- Concat
- |
-
- into (2d_input) --> Shape -->
- """
- opset_version = self.model.get_opset_version()
-
- inputs = len(concat_node.input)
- root = None
- shape_output = None
- for i in range(inputs):
- path = self.model.match_parent_path(
- concat_node,
- ["Unsqueeze", "Gather", "Shape"],
- [i, 0, 0],
- output_name_to_node,
- )
- if path is None:
- return
-
- unsqueeze, gather, shape = path
- if i == 0:
- shape_output = shape.output[0]
- if root is None:
- root = shape.input[0]
- if self.get_dimensions(root) != inputs:
- return
- elif shape.input[0] != root:
- return
-
- if not FusionUtils.check_node_attribute(
- unsqueeze, "axis", 0, default_value=0
- ):
- return
-
- if opset_version < 13:
- if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]):
- return
- else:
- if not self.utils.check_node_input_value(unsqueeze, 1, [0]):
- return
-
- value = self.model.get_constant_value(gather.input[1])
- from numpy import array_equal, ndarray
-
- if not (
- isinstance(value, ndarray) and value.size == 1 and value.item() == i
- ):
- return
-
- if self.model.find_graph_output(concat_node.output[0]) is None:
- self.model.replace_input_of_all_nodes(concat_node.output[0], shape_output)
- self.fused_count += 1
- self.prune_graph = True
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py
deleted file mode 100644
index d0797b26dc6edfabd91f4bd9d07d0c1da383ef8b..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-
-from onnx import helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionSkipLayerNormalization(Fusion):
- """
- Fuse Add + LayerNormalization into one node: SkipLayerNormalization
- Note: This fusion does not check the input shape of Add and LayerNormalization.
- """
-
- def __init__(self, model: OnnxModel):
- super().__init__(
- model, "CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"
- )
- # Update shape inference is needed since other fusions might add new edge which does not have shape info yet.
- self.shape_infer_helper = self.model.infer_runtime_shape(
- {"batch_size": 4, "seq_len": 7}, update=True
- )
-
- if self.shape_infer_helper is None:
- # TODO(tianleiwu): support subgraph in shape inference or add broadcasting in SkipLayerNormalization op.
- logger.warning("symbolic shape inference disabled or failed.")
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- add = self.model.get_parent(node, 0, output_name_to_node)
-
-        # In some models there is input_ids->Gather->Add->LayerNorm and one of the inputs of the
-        # Add node is an initializer with a fixed shape, which should not be fused into SkipLayerNorm
-        if add is None:
-            return
-
-        for add_input in add.input:
-            if self.model.get_initializer(add_input) is not None:
-                return
-
-        # The Add node should have exactly 2 parent nodes
- if len(self.model.get_parents(add)) != 2:
- return
-
- if self.shape_infer_helper is not None:
- if not self.shape_infer_helper.compare_shape(add.input[0], add.input[1]):
- logger.debug(
- "skip SkipLayerNormalization fusion since shape of inputs (%s, %s) are not same",
- add.input[0],
- add.input[1],
- )
- return
- else:
- layernorm_weight = self.model.get_initializer(node.input[1])
- if layernorm_weight is not None:
- layernorm_weight_arr = NumpyHelper.to_array(layernorm_weight)
- hidden_size = layernorm_weight_arr.shape[0]
- else:
- logger.debug(
- "skip SkipLayerNormalization fusion since symbolic shape inference failed"
- )
- return
-
- # gather_path = self.model.match_parent_path(add, ["Gather"], [None])
- # if gather_path is not None and self.model.find_graph_input(gather_path[0].input[1]) is None:
- # if self.model.match_parent_path(gather_path[0], ["ConstantOfShape"], [1]) is None:
- # return
-
- if (
- add is not None
- and add.op_type == "Add"
- and self.model.is_safe_to_fuse_nodes(
- [add, node], node.output, input_name_to_nodes, output_name_to_node
- )
- ):
- self.nodes_to_remove.extend([add, node])
-
- inputs = [add.input[0], add.input[1]]
- normalize_node = helper.make_node(
- "CustomSkipLayerNormPluginDynamic_IxRT",
- inputs=inputs,
- outputs=[node.output[0]],
- name=self.model.create_node_name(
- "SkipLayerNormalization", name_prefix="SkipLayerNorm"
- ),
- )
- normalize_node.domain = "com.iluvatar"
- if self.shape_infer_helper is not None:
- hidden_size = self.shape_infer_helper.get_edge_shape(node.input[1])[-1]
- normalize_node.attribute.extend([helper.make_attribute("ld", hidden_size)])
- normalize_node.attribute.extend([helper.make_attribute("type_id", 2)])
- normalize_node.attribute.extend(
- [
- helper.make_attribute(
- "beta", self.model.get_initializer(node.input[2])
- )
- ]
- )
- normalize_node.attribute.extend(
- [
- helper.make_attribute(
- "gamma", self.model.get_initializer(node.input[1])
- )
- ]
- )
- normalize_node.attribute.extend(
- [helper.make_attribute("plugin_namespace", "")]
- )
- normalize_node.attribute.extend(
- [helper.make_attribute("plugin_version", "1")]
- )
-
- self.nodes_to_add.append(normalize_node)
- self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
-
-
-class FusionBiasSkipLayerNormalization(Fusion):
- def __init__(self, model: OnnxModel):
- super().__init__(
- model,
- "CustomSkipLayerNormPluginDynamic_IxRT",
- "SkipLayerNormalization",
- "add bias",
- )
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
- if len(node.input) != 4:
- return
-
- return_indice = []
- nodes = self.model.match_parent_path(
- node, ["Add", "MatMul"], [None, None], None, return_indice
- )
- if nodes is None:
- return
- assert len(return_indice) == 2
- add_input_index = return_indice[0]
- if add_input_index >= 2:
- return
-
- (add, matmul) = nodes
-
-        # bias should be one-dimensional
-        bias_index = -1
-        bias_weight = None
- for i, input in enumerate(add.input):
- initializer = self.model.get_initializer(input)
- if initializer is None:
- continue
- bias_index = i
- bias_weight = NumpyHelper.to_array(initializer)
- break
- if bias_weight is None:
- logger.debug(f"Bias weight not found")
- return
- if len(bias_weight.shape) != 1:
- logger.debug(f"Bias weight is not 1D")
- return
-
- subgraph_nodes = [node, add]
- if not self.model.is_safe_to_fuse_nodes(
- subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node
- ):
- logger.debug(
- f"Skip fusing SkipLayerNormalization with Bias since it is not safe"
- )
- return
-
- self.nodes_to_remove.extend(subgraph_nodes)
- inputs = [
- node.input[1 - add_input_index],
- matmul.output[0],
- node.input[2],
- node.input[3],
- add.input[bias_index],
- ]
- new_node = helper.make_node(
- "CustomSkipLayerNormPluginDynamic_IxRT",
- inputs=inputs,
- outputs=node.output,
- name=self.model.create_node_name(
- "SkipLayerNormalization", "SkipLayerNorm_AddBias_"
- ),
- )
- new_node.domain = "com.iluvatar"
- hidden_size = self.shape_infer_helper.get_edge_shape(node.input[2])[-1]
- new_node.attribute.extend([helper.make_attribute("ld", hidden_size)])
- new_node.attribute.extend([helper.make_attribute("type_id", 2)])
- new_node.attribute.extend(
- [helper.make_attribute("beta", self.model.get_initializer(node.input[3]))]
- )
- new_node.attribute.extend(
- [helper.make_attribute("gamma", self.model.get_initializer(node.input[2]))]
- )
- new_node.attribute.extend(
- [
- helper.make_attribute(
- "bias", self.model.get_initializer(add.input[bias_index])
- )
- ]
- )
- new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- new_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
-
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py
deleted file mode 100644
index 436257c3ce09b25790b132b6f918afebc63d9380..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Tuple, Union
-
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionSplitQKV(Fusion):
- """
-    Fuse the QKV split subgraph into one SplitQKV_IxRT node.
- """
-
- def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
- super().__init__(model, "SplitQKV_IxRT", "MatMul")
-
- self.hidden_size = hidden_size
- self.num_heads = num_heads
-
- def create_splitqkv_node(
- self, input: str, query_out: str, key_out: str, value_out: str
- ) -> Union[NodeProto, None]:
- """Create an XSoftmax node.
-
- Args:
- data_input (str): data input name
- mask_input (str): max input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- node_name = self.model.create_node_name("SplitQKV_IxRT")
-
- new_node = helper.make_node(
- "SplitQKV_IxRT",
- inputs=[input],
- outputs=[query_out, key_out, value_out],
- name=node_name,
- )
- new_node.domain = "com.iluvatar"
- new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- new_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- new_node.attribute.extend(
- [helper.make_attribute("atten_scale", 1 / self.num_heads)]
- )
-
- return new_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- split_query_paths = {
- "query_path": (
- ["Div", "Transpose", "Reshape", "Slice", "CustomFCPluginDynamic_IxRT"],
- [0, 0, 0, 0, 0],
- ),
- }
-
- split_key_paths = {
- "key_path": (["Transpose", "Reshape", "Slice"], [1, 0, 0]),
- }
-
- q_nodes, q_path = self.match_parent_path_from_dict(node, split_query_paths)
-
- k_nodes, k_path = self.match_parent_path_from_dict(node, split_key_paths)
-
- if (q_nodes is not None) and (k_nodes is not None):
- (
- q_div_node,
- q_transpose_node,
- q_reshape_node,
- q_slice_node,
- coustom_fc_node,
- ) = q_nodes
- k_transpose_node, k_reshape_node, k_slice_node = k_nodes
- slice_nodes = self.model.get_children(coustom_fc_node)
-
- if len(slice_nodes) != 3:
- return
- slice_nodes.remove(q_slice_node)
- slice_nodes.remove(k_slice_node)
- v_slice_node = slice_nodes[0]
-
-            node.input[0] = q_div_node.input[0]  # bypass the Div node
- new_node = self.create_splitqkv_node(
- coustom_fc_node.output[0],
- q_slice_node.output[0],
- k_slice_node.output[0],
- v_slice_node.output[0],
- )
-
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
- self.nodes_to_remove.append(q_slice_node)
- self.nodes_to_remove.append(k_slice_node)
- self.nodes_to_remove.append(v_slice_node)
- self.nodes_to_remove.append(q_div_node)
-
- else:
- return
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV_update_KVcache.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV_update_KVcache.py
deleted file mode 100644
index 4152eef6e6371dd4da27b5315bf5bd741d0749d1..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV_update_KVcache.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Tuple, Union
-
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionSplitQKVUpdateKVCache(Fusion):
- """
-    Fuse the QKV split and KV-cache update subgraph into one SplitQKVUpdateKVCache_IxRT node.
- """
-
- def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
- super().__init__(
- model, "SplitQKVUpdateKVCache_IxRT", "CustomQkvCrossToContext_IxRT"
- )
-
- self.hidden_size = hidden_size
- self.num_heads = num_heads
-
- def create_node(
- self,
- inputs: list,
- outputs: list,
- ) -> Union[NodeProto, None]:
- """Create an XSoftmax node.
-
- Args:
- data_input (str): data input name
- mask_input (str): max input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- node_name = self.model.create_node_name("SplitQKVUpdateKVCache_IxRT")
-
- new_node = helper.make_node(
- "SplitQKVUpdateKVCache_IxRT",
- inputs=inputs,
- outputs=outputs,
- name=node_name,
- )
- new_node.domain = "com.iluvatar"
- new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- new_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- new_node.attribute.extend([helper.make_attribute("num_head", self.num_heads)])
- new_node.attribute.extend(
- [helper.make_attribute("head_dim", self.hidden_size // self.num_heads)]
- )
-
- return new_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- query_paths = {
- "query_path": (
- ["Transpose", "Reshape", "Split"],
- [0, 0, None],
- ),
- }
-
- key_paths = {
- "key_path": (
- ["Concat", "Transpose", "Reshape", "Split"],
- [1, None, 0, None],
- ),
- }
-
- value_paths = {
- "value_path": (
- ["Concat", "Transpose", "Reshape", "Split"],
- [2, None, 0, None],
- ),
- }
-
- q_nodes, q_path = self.match_parent_path_from_dict(node, query_paths)
-
- k_nodes, k_path = self.match_parent_path_from_dict(node, key_paths)
-
- v_nodes, v_path = self.match_parent_path_from_dict(node, value_paths)
-
- if (q_nodes is not None) and (k_nodes is not None) and (v_nodes is not None):
- (q_transpose_node, q_reshape_node, q_split_node) = q_nodes
- (k_concat_node, k_transpose_node, k_reshape_node, k_split_node) = k_nodes
-
- (v_concat_node, v_transpose_node, v_reshape_node, v_split_node) = v_nodes
-
- inputs = [
- q_split_node.input[0],
- k_concat_node.input[0],
- v_concat_node.input[0],
- ]
-
- outputs = [
- q_transpose_node.output[0],
- k_concat_node.output[0],
- v_concat_node.output[0],
- ]
-
- new_node = self.create_node(inputs, outputs)
-
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
- self.nodes_to_remove.append(q_transpose_node)
- self.nodes_to_remove.append(q_reshape_node)
- self.nodes_to_remove.append(q_split_node)
-
- self.nodes_to_remove.append(k_concat_node)
- self.nodes_to_remove.append(k_transpose_node)
- self.nodes_to_remove.append(k_reshape_node)
-
- self.nodes_to_remove.append(v_concat_node)
- self.nodes_to_remove.append(v_transpose_node)
- self.nodes_to_remove.append(v_reshape_node)
-
- else:
- return
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py
deleted file mode 100644
index e446a69a636ed38e6e869a15ba6196d727b6d855..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py
+++ /dev/null
@@ -1,413 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import List, Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-def get_tensor_attr(attrs, attr_name):
- result = None
- for i in attrs:
- if i.name == attr_name:
- return numpy_helper.to_array(i.t)
- return result
-
-
-class FusionSwinLAttention(Fusion):
- """
- Fuse SwinL subgraph into one Attention node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(
- model,
- "CustomQKVToContextPluginDynamic_IxRT",
- ["CustomFCPluginDynamic_IxRT"],
- )
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def get_num_heads_and_hidden_size(self, reshape_v: NodeProto) -> Tuple[int, int]:
- """Detect num_heads and hidden_size from a reshape node.
-
- Args:
-            reshape_v (NodeProto): reshape node for V
-
- Returns:
- Tuple[int, int]: num_heads and hidden_size
- """
-
-        # we assume that reshape fusion has been done, so the shape is a tensor like [0, 0, num_heads, head_size]
- v_shape = self.model.get_initializer(reshape_v.input[1])
- if v_shape is None:
- logger.debug(f"{reshape_v.input[1]} is not initializer.")
- return self.num_heads, self.hidden_size # Fall back to user specified value
-
- v_shape_value = NumpyHelper.to_array(v_shape)
- if len(v_shape_value) != 3 or (v_shape_value[1] <= 0 or v_shape_value[2] <= 0):
- logger.debug(
- f"v_shape_value={v_shape_value}. Expected value are like [0, 0, num_heads, head_size]."
- )
- return self.num_heads, self.hidden_size # Fall back to user specified value
-
- num_heads = 1
- for value_info in self.model.graph().value_info:
- if value_info.name == reshape_v.input[0]:
- num_heads = value_info.type.tensor_type.shape.dim[2].dim_value
- break
- hidden_size = v_shape_value[2]
-
- return num_heads, hidden_size
-
- def create_attention_node(
- self,
- num_heads: int,
- hidden_size: int,
- inputs: List[str],
- output: str,
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
- hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
-            inputs (List[str]): input names
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- assert num_heads > 0
-
- if hidden_size > 0 and (hidden_size % num_heads) != 0:
- logger.debug(
- f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
- )
- return None
-
- attention_node_name = self.model.create_node_name("Attention")
-
- attention_node = helper.make_node(
- "CustomQKVToContextPluginDynamic_IxRT",
- inputs=inputs,
- outputs=[output],
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
- attention_node.attribute.extend(
- [helper.make_attribute("hidden_size", hidden_size)]
- )
- attention_node.attribute.extend([helper.make_attribute("has_mask", 1)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)])
- return attention_node
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
- self.fuse_pattern1(normalize_node, input_name_to_nodes, output_name_to_node)
- self.fuse_pattern2(normalize_node, input_name_to_nodes, output_name_to_node)
-
- def fuse_pattern2(self, normalize_node, input_name_to_nodes, output_name_to_node):
- """match Swin-L pattern and fuse them to CustomFC --> Attention --> CustomFC"""
- logger.debug("fuse swin-L attention pass")
- # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern
- start_node = normalize_node
- qkv_paths = {
- "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]),
- }
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
- if qkv_nodes is None:
- logger.debug("fuse_attention: failed to match qkv path")
- return
- assert qkv_path == "path1", "abnormal qkv path"
- reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes
-
- # 2. MatMul as start, go up to find v path
- v_paths = {
- "path1": (
- ["Transpose", "Reshape", "CustomFCPluginDynamic_IxRT"],
- [None, 0, 0],
- )
- }
- v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
- if not v_nodes:
- logger.debug("fuse_attention: failed to match v path")
- return
- assert v_path == "path1", "abnormal v path"
-
- # 3. MatMul as start, go up to find q,k paths
- # q path
- q_paths = {
- "path1": (
- [
- "Softmax",
- "Add",
- "Div",
- "MatMul",
- "Transpose",
- "Reshape",
- "CustomFCPluginDynamic_IxRT",
- ],
- [None, 0, 0, 0, 0, 0, 0],
- ),
- }
- q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths)
- if not q_nodes:
- logger.debug("fuse_attention: failed to match q path")
- return
- assert q_path == "path1", "abnormal q paths found"
-
- # get Add(bias) input name as fused Attention inputs
- add_op, div_op = q_nodes[1], q_nodes[2]
- relative_position_bias_name = (
- add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0]
- )
-
- # k path
- k_paths = {
- "path2": (
- [
- "Softmax",
- "Add",
- "Div",
- "MatMul",
- "Transpose",
- "Reshape",
- "CustomFCPluginDynamic_IxRT",
- ],
- [None, 0, 0, 0, 1, 0, 0],
- )
- }
- k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths)
- if not k_nodes:
- logger.debug("fuse_attention: failed to match k path")
- return
- assert k_path == "path2", "abnormal k paths found"
- # 4. Fuse 3 CustomFC into one, and fuse attention
- # Fuse FCs
- fc_nodes = [q_nodes[-1], k_nodes[-1], v_nodes[-1]]
- weight = self.fuse_tensor_in_node_attrs(
- fc_nodes, "W", q_nodes[-1].name + "_Weight"
- )
- bias = self.fuse_tensor_in_node_attrs(fc_nodes, "B", q_nodes[-1].name + "_Bias")
- fused_node = helper.make_node(
- "CustomFCPluginDynamic_IxRT",
- inputs=[q_nodes[-1].input[0]],
- outputs=q_nodes[-1].output,
- name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
- )
- fused_node.domain = "com.iluvatar"
- fused_node.attribute.extend(
- [helper.make_attribute("out_dims", numpy_helper.to_array(bias).shape[0])]
- )
- fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
- fused_node.attribute.extend([helper.make_attribute("W", weight)])
- fused_node.attribute.extend([helper.make_attribute("B", bias)])
- fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- self.nodes_to_add.append(fused_node)
-
- # Fuse Attention
- num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv)
- attention_node = self.create_attention_node(
- num_heads,
- hidden_size,
- [fused_node.output[0], relative_position_bias_name],
- reshape_qkv.output[0],
- )
- if not attention_node:
- return
- self.nodes_to_add.append(attention_node)
- self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
- self.nodes_to_remove.extend(
- [*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes]
- )
- self.prune_graph = True
-
- def fuse_pattern1(self, normalize_node, input_name_to_nodes, output_name_to_node):
- """match Swin-L pattern and fuse them to CustomFC --> Attention --> CustomFC"""
- logger.debug("fuse swin-L attention pass")
- # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern
- start_node = normalize_node
- qkv_paths = {
- "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]),
- }
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
- if qkv_nodes is None:
- logger.debug("fuse_attention: failed to match qkv path")
- return
- assert qkv_path == "path1", "abnormal qkv path"
- reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes
-
- # 2. MatMul as start, go up to find v path
- v_paths = {
- "path1": (
- ["Transpose", "Reshape", "Add", "Split", "MatMul"],
- [None, 0, 0, None, 0],
- )
- }
- v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
- if not v_nodes:
- logger.debug("fuse_attention: failed to match v path")
- return
- assert v_path == "path1", "abnormal v path"
-
- # 3. MatMul as start, go up to find q,k paths
- # q path
- q_paths = {
- "path1": (
- [
- "Softmax",
- "Add",
- "Div",
- "MatMul",
- "Transpose",
- "Reshape",
- "Add",
- "Split",
- "MatMul",
- ],
- [None, 0, 0, 0, 0, 0, 0, None, 0],
- ),
- }
- q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths)
- if not q_nodes:
- logger.debug("fuse_attention: failed to match q path")
- return
- assert q_path == "path1", "abnormal q paths found"
-
- # get Add(bias) input name as fused Attention inputs
- add_op, div_op = q_nodes[1], q_nodes[2]
- relative_position_bias_name = (
- add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0]
- )
-
- # k path
- k_paths = {
- "path2": (
- [
- "Softmax",
- "Add",
- "Div",
- "MatMul",
- "Transpose",
- "Reshape",
- "Add",
- "Split",
- "MatMul",
- ],
- [None, 0, 0, 0, 1, 0, 0, None, 0],
- )
- }
- k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths)
- if not k_nodes:
- logger.debug("fuse_attention: failed to match k path")
- return
- assert k_path == "path2", "abnormal k paths found"
- # 4. Attention and CustomFC have been found, now transform the found nodes to two plugin nodes
- # Test 3 paths have the same origin
- is_same_origin = q_nodes[-1] is k_nodes[-1] is v_nodes[-1]
- is_same_origin &= q_nodes[-2] is k_nodes[-2] is v_nodes[-2]
- is_same_origin &= q_nodes[-3] is not k_nodes[-2] is not v_nodes[-3]
- if not is_same_origin:
- print("swin-L fuse_attention: found qkv path but not has the same origin")
- return
- origin_matmul = q_nodes[-1]
- fc_add = [q_nodes[-3], k_nodes[-3], v_nodes[-3]]
- # Now fuse
- num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv)
-
- # Fuse FC
- weight = self.model.get_initializer(origin_matmul.input[1])
- biases = [self.model.get_initializer(i.input[0]) for i in fc_add]
- if not weight or not all(biases):
- print("swin-L: couldn't find weights")
- return
- weight_arr = onnx.numpy_helper.to_array(weight).transpose(1, 0)
- weight.CopyFrom(numpy_helper.from_array(weight_arr))
- bias_arr = np.concatenate(
- [onnx.numpy_helper.to_array(i) for i in biases], axis=0
- )
-
- fused_node = helper.make_node(
- "CustomFCPluginDynamic_IxRT",
- inputs=[origin_matmul.input[0]],
- outputs=fc_add[0].output,
- name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
- )
- fused_node.domain = "com.iluvatar"
- fused_node.attribute.extend(
- [helper.make_attribute("out_dims", bias_arr.shape[0])]
- )
- fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
- fused_node.attribute.extend([helper.make_attribute("W", weight)])
- fused_node.attribute.extend(
- [helper.make_attribute("B", numpy_helper.from_array(bias_arr))]
- )
- fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
- self.nodes_to_add.append(fused_node)
- # Fuse Attention
- attention_node = self.create_attention_node(
- num_heads,
- hidden_size,
- [fused_node.output[0], relative_position_bias_name],
- reshape_qkv.output[0],
- )
- if not attention_node:
- return
- self.nodes_to_add.append(attention_node)
- self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
- self.nodes_to_remove.extend(
- [*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes]
- )
- self.prune_graph = True
-
- def fuse_tensor_in_node_attrs(self, fc_nodes, attr_name, tensor_name):
- result = [get_tensor_attr(i.attribute, attr_name) for i in fc_nodes]
- result = np.concatenate(result, axis=0)
- result = numpy_helper.from_array(result, tensor_name)
- return result
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py
deleted file mode 100644
index bce0ab1713f20a19533e5793c4888607a7619c81..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py
+++ /dev/null
@@ -1,495 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class FusionT5EncoderAttention(Fusion):
- """
- Fuse T5Attention subgraph into one Attention node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(
- model,
- "CustomQKVToContextPluginDynamic_IxRT",
- ["CustomSkipLayerNormPluginDynamic_IxRT", "RMSNormPluginDynamic_IxRT"],
- )
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
- """Detect num_heads and hidden_size from a reshape node.
-
- Args:
- reshape_q (NodeProto): reshape node for Q
-
- Returns:
- Tuple[int, int]: num_heads and hidden_size
- """
-
-        # we assume that reshape fusion has been done, so the shape is a tensor like [0, 0, num_heads, head_size]
- q_shape = self.model.get_initializer(reshape_q.input[1])
- if q_shape is None:
- logger.debug(f"{reshape_q.input[1]} is not initializer.")
- return [0, 0]
-
- q_shape_value = NumpyHelper.to_array(q_shape)
- if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
- logger.debug(
- f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]."
- )
- return [0, 0]
-
- num_heads = q_shape_value[2]
- head_size = q_shape_value[3]
- hidden_size = num_heads * head_size
-
- return num_heads, hidden_size
-
- def create_attention_node(
- self,
- num_heads: int,
- hidden_size: int,
- input: str,
- output: str,
- matmul_qk_add: NodeProto,
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
- hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
- input (str): input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- assert num_heads > 0
-
- if hidden_size > 0 and (hidden_size % num_heads) != 0:
- logger.debug(
- f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
- )
- return None
-
- attention_node_name = self.model.create_node_name("Attention")
-
- qk_bias = None
- has_mask = 0
- has_qk_bias = 0
- add_input_is_value = False
- if matmul_qk_add is not None:
- has_qk_bias = 1
- qk_bias = self.model.get_initializer(matmul_qk_add.input[1])
- if qk_bias:
- add_input_is_value = True
- qk_bias_arr = NumpyHelper.to_array(qk_bias)
- if len(qk_bias_arr.shape) == 3:
- qk_bias_arr = qk_bias_arr.squeeze(0)
- has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0)
- if np.any(has_neg_inf):
- qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype(
- np.float32
- )
- qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name))
-
- attention_inputs = [input]
-
-        # If the Add input is not a constant value but an edge, the value on that edge needs to be cast to fp32
- cast_node = None
- if not add_input_is_value:
- cast_out_name = attention_node_name + "_fp32_in1"
- cast_out_tensor = helper.make_tensor_value_info(
- cast_out_name, TensorProto.FLOAT, [None, None, None, None]
- )
- # self.model.add_initializer(cast_out_name)
- cast_node = helper.make_node(
- "Cast",
- inputs=[matmul_qk_add.input[1]],
- outputs=[cast_out_tensor.name],
- name=self.model.create_node_name("Cast"),
- to=1,
- )
- self.node_name_to_graph_name[cast_node.name] = self.this_graph_name
- attention_inputs.append(cast_out_name)
-
- if has_qk_bias:
- if add_input_is_value:
- has_mask = 1
- attention_inputs.append(qk_bias.name)
- else:
- has_mask = 1
-
- attention_node = helper.make_node(
- "CustomQKVToContextPluginDynamic_IxRT",
- inputs=attention_inputs,
- outputs=[output],
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
- attention_node.attribute.extend(
- [helper.make_attribute("hidden_size", hidden_size)]
- )
- attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend(
- [helper.make_attribute("has_qk_bias", has_qk_bias)]
- )
- attention_node.attribute.extend([helper.make_attribute("is_t5_mode", 1)])
-
- return attention_node, cast_node
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
-        # Sometimes we cannot fuse skiplayernormalization since the add before layernorm has an output that is used by nodes outside skiplayernorm
- # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
- start_node = normalize_node
- if normalize_node.op_type == "RMSNormPluginDynamic_IxRT":
- add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
- if add_before_layernorm is not None:
- start_node = add_before_layernorm
-
- # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
- qkv_paths = {
- "path1": (["MatMul", "Reshape", "Transpose", "MatMul"], [0, 0, 0, 0]),
- "path2": (["MatMul", "Reshape", "Transpose", "MatMul"], [1, 0, 0, 0]),
- }
-
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
-
- if qkv_nodes is None:
- logger.debug("fuse_attention: failed to match qkv path")
- return
-
- if qkv_path in ["path1", "path2"]:
- (atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
-
- other_inputs = []
- for i, input in enumerate(start_node.input):
- if input not in output_name_to_node:
- continue
-
- if input == qkv_nodes[0].output[0]:
- continue
- other_inputs.append(input)
- if len(other_inputs) != 1:
- return
-
- root_input = other_inputs[0]
- """
- Match T5
- Add/Gather --> LayerNormalization --> Attention --> Add --> LayerNormalization
- | |
- | |
- +---------------------------------------------------
- """
- transpose_before_layernorm = self.model.match_parent(start_node, "Gather", 0)
- if transpose_before_layernorm is not None:
- node_children = input_name_to_nodes[transpose_before_layernorm.output[0]]
- for child in node_children:
- if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT":
- root_input = child.output[0]
-
- add_before_layernorm = self.model.match_parent(start_node, "Add", None)
- if add_before_layernorm is not None:
- node_children = input_name_to_nodes[add_before_layernorm.output[0]]
- for child in node_children:
- if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT":
- root_input = child.output[0]
-
- v_paths = {
- "path1": (
- ["Transpose", "Reshape", "Split", "MatMul"],
- [1, 0, 0, None],
- ) # T5
- }
-
- v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
- if v_path == "path1":
- (_, _, _, matmul_in_qkv) = v_nodes
-
- if v_nodes is None:
- logger.debug("fuse_attention: failed to match v path")
- return
-
- qk_paths = {
- "path1": (["Softmax", "MatMul"], [0, 0]),
- "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]),
- }
-
- qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths)
-
- if qk_nodes is None:
- logger.debug("fuse_attention: failed to match qk path")
- return
-
- matmul_qk_add = None
- if qk_path == "path1":
- (_, matmul_qk) = qk_nodes
- else:
- (_, matmul_qk_add, matmul_qk) = qk_nodes
-
- q_paths = {"path1": (["Transpose", "Reshape", "Split"], [0, 0, 0])}
- q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths)
- if q_nodes is None:
- logger.debug("fuse_attention: failed to match q path")
- return
-
- if q_path == "path1":
- (_, reshape_q, split_q) = q_nodes
- # print(" split_q.name : ", split_q.name)
-
- k_paths = {
- "path1": (["Transpose", "Reshape", "Split"], [1, 0, 0]),
- }
- k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths)
-
- if k_nodes is None:
- logger.debug("fuse_attention: failed to match k path")
- return
-
- if k_path == "path1":
- (_, _, split_k) = k_nodes
-
- if (
- matmul_in_qkv.input[0] == root_input
- and split_q.input[0] == matmul_in_qkv.output[0]
- and split_k.input[0] == matmul_in_qkv.output[0]
- ):
- attention_last_node = reshape_qkv
-
- num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
-
- new_node, new_cast_node = self.create_attention_node(
- num_heads,
- hidden_size,
- matmul_in_qkv.output[0],
- attention_last_node.output[0],
- matmul_qk_add,
- )
- if new_node is None:
- return
-
- self.nodes_to_add.append(new_node)
- if new_cast_node:
- self.nodes_to_add.append(new_cast_node)
-
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
-
- self.nodes_to_remove.extend(
- [attention_last_node, transpose_qkv, matmul_qkv]
- )
- self.nodes_to_remove.extend(qk_nodes)
- self.nodes_to_remove.extend(q_nodes)
- self.nodes_to_remove.extend(k_nodes)
- self.nodes_to_remove.extend(v_nodes[:-2])
-
-
-class FusionT5DecoderAttention(Fusion):
- """
- Fuse T5Attention subgraph into one Attention node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(
- model,
- "CustomQkvCrossToContext_IxRT",
- ["Softmax"],
- )
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
- """Detect num_heads and hidden_size from a reshape node.
-
- Args:
- reshape_q (NodeProto): reshape node for Q
-
- Returns:
- Tuple[int, int]: num_heads and hidden_size
- """
-
-        # we assume that reshape fusion has been done, so the shape is a tensor like [0, 0, num_heads, head_size]
- q_shape = self.model.get_initializer(reshape_q.input[1])
- if q_shape is None:
- logger.debug(f"{reshape_q.input[1]} is not initializer.")
- return [0, 0]
-
- q_shape_value = NumpyHelper.to_array(q_shape)
- if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
- logger.debug(
- f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]."
- )
- return [0, 0]
-
- num_heads = q_shape_value[2]
- head_size = q_shape_value[3]
- hidden_size = num_heads * head_size
-
- return num_heads, hidden_size
-
- def create_decoder_attention_node(
- self, inputs: str, outputs: str, type_mask: int, has_mask: int
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
-        Args:
-            inputs (list): input names (q, k, v and an optional mask)
-            outputs (list): output names
-            type_mask (int): mask data type id
-            has_mask (int): whether a mask input is present
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
-
- attention_node_name = self.model.create_node_name("decoder_Attention")
- attention_node = helper.make_node(
- "CustomQkvCrossToContext_IxRT",
- inputs=inputs,
- outputs=outputs,
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("scale", 1.0)])
- attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend([helper.make_attribute("type_mask", type_mask)])
-
- return attention_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- """
- path1:
-
- (query) ---------------->MatMul --> add -->softmax --->MatMul--->
- / / /
- (key) ---->Transpose --> / /
- / /
- (mask) ------------------------> /
- /
- (value)--------------------------------------------->
-
-
-
- path2:
-
- (query) ---------------->MatMul ---------->softmax --->MatMul--->
- / /
- (key) ---->Transpose --> /
- /
- /
- /
- (value)--------------------------------------------->
-
- """
-
- start_node = node
- qkv_paths = {
- "path1": (
- ["Add", "MatMul", "Transpose"],
- [0, 0, 0],
-            ), # float mask self attention, self attention key pass
-            "path2": (["MatMul", "Transpose"], [0, 0]), # cross attention query pass
- }
-
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
- if qkv_nodes is None:
- logger.debug("fuse_attention: failed to match qkv path")
- return
- next_nodes = self.model.get_children(node)
- if len(next_nodes) == 0:
- return
-
- if next_nodes[0].op_type != "MatMul":
- return
-
- second_matmul_node = next_nodes[0]
- attention_inputs = None
- attention_outputs = second_matmul_node.output
- remove_nodes = [second_matmul_node, node]
- if qkv_path == "path1":
- (add_node, first_matmul_node, transpose_node) = qkv_nodes
- transpose_nodes = self.model.get_parents(first_matmul_node)
- q_input = transpose_nodes[0].output[0]
- k_input = transpose_nodes[1].input[0]
- v_input = second_matmul_node.input[1]
- attention_inputs = [q_input, k_input, v_input]
- remove_nodes.extend([add_node, first_matmul_node, transpose_nodes[1]])
-
- if qkv_path == "path2":
- (first_matmul_node, transpose_node) = qkv_nodes
- transpose_nodes = self.model.get_parents(first_matmul_node)
- q_input = transpose_nodes[0].output[0]
- k_input = transpose_nodes[1].input[0]
- v_input = second_matmul_node.input[1]
- attention_inputs = [q_input, k_input, v_input]
- remove_nodes.extend([first_matmul_node, transpose_nodes[1]])
-
- has_mask = 0
- type_mask = 4 # int32 mask
-
- if qkv_path == "path1":
- mask_input = add_node.input[0]
- score_out = first_matmul_node.output[0]
- if add_node.input[0] == score_out:
- mask_input = add_node.input[1]
- attention_inputs.append(mask_input)
- has_mask = 1
- type_mask = 3 # float mask
-
- atten_node = self.create_decoder_attention_node(
- attention_inputs, attention_outputs, type_mask, has_mask
- )
- self.nodes_to_add.append(atten_node)
- self.node_name_to_graph_name[atten_node.name] = self.this_graph_name
- self.nodes_to_remove.extend(remove_nodes)
\ No newline at end of file
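For reference, the heart of `create_attention_node` in the file above is just assembling a custom-domain ONNX node; a minimal sketch using only the public `onnx.helper` API (node type, domain and attribute names are taken from the deleted pass, all concrete values are illustrative):

```python
from onnx import helper

def make_t5_attention_node(name, inputs, output, num_heads, hidden_size,
                           has_mask=0, has_qk_bias=0):
    # Same plugin node the deleted FusionT5EncoderAttention pass emitted.
    node = helper.make_node(
        "CustomQKVToContextPluginDynamic_IxRT",
        inputs=inputs,
        outputs=[output],
        name=name,
    )
    node.domain = "com.iluvatar"
    node.attribute.extend([
        helper.make_attribute("type_id", 2),
        helper.make_attribute("num_heads", num_heads),
        helper.make_attribute("hidden_size", hidden_size),
        helper.make_attribute("has_mask", has_mask),
        helper.make_attribute("plugin_namespace", ""),
        helper.make_attribute("plugin_version", "1"),
        helper.make_attribute("has_qk_bias", has_qk_bias),
        helper.make_attribute("is_t5_mode", 1),
    ])
    return node

# Example: a 12-head, 768-hidden attention fed by a fused QKV projection.
attention = make_t5_attention_node("Attention_0", ["qkv_proj_out"], "attn_out", 12, 768)
```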
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py
deleted file mode 100644
index 4765c8f51dbbf7b1f0da9e7821cc714665d1fbd8..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py
+++ /dev/null
@@ -1,276 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from logging import getLogger
-from typing import Tuple
-
-import numpy
-from numpy import array_equal, ndarray
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-from onnx import onnx_pb as onnx_proto
-
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionUtils:
- def __init__(self, model: OnnxModel):
- self.model: OnnxModel = model
-
- def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]:
- graph_input = self.model.find_graph_input(input_name)
- if (
- graph_input is not None
- and graph_input.type.tensor_type.elem_type != TensorProto.INT32
- ):
- cast_output, cast_node = self.cast_input_to_int32(input_name)
- logger.debug(f"Casted graph input {input_name} to int32")
- return True, cast_output
-
- logger.debug(
- f"Did not cast graph input {input_name} to int32: found {graph_input is not None}"
- )
- return False, input_name
-
- def cast_input_to_int32(self, input_name: str):
- cast_output = input_name + "_int32"
-
-        # Avoid consecutive Cast nodes.
- inputs = [input_name]
- output_name_to_node = self.model.output_name_to_node()
- if input_name in output_name_to_node:
- parent_node = output_name_to_node[input_name]
- if parent_node and parent_node.op_type == "Cast":
- inputs = [parent_node.input[0]]
-
- cast_node = helper.make_node("Cast", inputs=inputs, outputs=[cast_output])
- cast_node.attribute.extend(
- [helper.make_attribute("to", int(TensorProto.INT32))]
- )
- self.model.add_node(cast_node)
-
- return cast_output, cast_node
-
- def remove_cast_int32(self, input_name: str):
- input_name_to_nodes = self.model.input_name_to_nodes()
- nodes = input_name_to_nodes[input_name]
- for node in nodes:
- if node.op_type == "Cast":
- is_int32 = False
- for att in node.attribute:
- if att.name == "to" and att.i == int(TensorProto.INT32):
- is_int32 = True
- break
- if is_int32:
- output_name = node.output[0]
- self.model.remove_node(node)
- self.model.replace_input_of_all_nodes(output_name, input_name)
-
- @staticmethod
- def check_node_attribute(
- node, attribute_name: str, expected_value, default_value=None
- ):
- """Verify that a node has expected value for an attribute.
-
- Args:
- node (NodeProto): a node to check
- attribute_name (str): name of attribute
- expected_value (Any): expected value of the attribute
- default_value (Any, optional): default value if the attribute does not exist. Defaults to None.
-
- Returns:
- bool: whether the check is passed or not
- """
- value = default_value
- for attr in node.attribute:
- if attr.name == attribute_name:
- value = helper.get_attribute_value(attr)
-
- if isinstance(expected_value, list):
- return (
- isinstance(value, ndarray) or isinstance(value, list)
- ) and array_equal(expected_value, value, equal_nan=False)
- else:
- return value == expected_value
-
- @staticmethod
- def transpose_2d_int8_tensor(tensor: onnx_proto.TensorProto):
- """Transpose a 2-D INT8 TensorProto
- Args:
- tensor (TensorProto): tensor to be transposed
- Returns:
- tensor (TensorProto): transposed tensor
- """
- if not isinstance(tensor, onnx_proto.TensorProto):
- raise ValueError(
- "Expected input type is an ONNX TensorProto but got %s" % type(tensor)
- )
-
- if len(tensor.dims) != 2 or tensor.data_type != onnx_proto.TensorProto.INT8:
- raise ValueError("Only INT8 2-D tensors can be transposed")
-
- if tensor.raw_data:
- int32_data = numpy.reshape(
- numpy.frombuffer(tensor.raw_data, dtype="int8"), tensor.dims
- )
- int32_transposed_data = numpy.transpose(int32_data, [1, 0])
- tensor.raw_data = int32_transposed_data.tobytes()
-
- else:
- raise ValueError("only raw buffer supported")
-
- return tensor
-
- @staticmethod
- def check_qdq_node_for_fusion(
- node: NodeProto, model: OnnxModel, allow_per_tensor_quantization_only=True
- ):
- """Verify if a provided QuantizeLinear (Q) / DequantizeLinear (DQ) node is a good candidate for fusion.
- It is a good candidate for fusion if:
- (1) The Q/DQ node is for per-tensor quantization if allow_per_tensor_quantization_only is `True`
- (2) The Q/DQ node should have constant scale
- (3) The Q/DQ node should have a zero point of 0
- Args:
- node (NodeProto): a Q/DQ node to check
- Returns:
- bool: whether the check is passed or not
- """
-        if node.op_type not in {"QuantizeLinear", "DequantizeLinear"}:
-            logger.debug(f"Provided node is not a Q/DQ node. Op Type: {node.op_type}")
-            return False
-
- scale = model.get_constant_value(node.input[1])
-
- # Scale is not constant
- if scale is None:
- return False
-
- # Not per-tensor quantization
- scale_has_single_element = scale.ndim == 0 or (
- scale.ndim == 1 and scale.shape[0] == 1
- )
- if allow_per_tensor_quantization_only and not scale_has_single_element:
- return False
-
- # If the Q/DQ node has no zero point input, it is assumed to be 0 (per ONNX spec)
- if len(node.input) == 2:
- return True
-
-        # Zero point should be constant and should have a value of 0
-        zero_point = model.get_constant_value(node.input[2])
-
-        # Zero point is not constant
-        if zero_point is None:
-            return False
-
-        # Zero point and scale should have the same number of dims
-        if scale.ndim != zero_point.ndim:
-            return False
-
- return numpy.all(zero_point == 0)
-
- def check_node_input_value(self, node, input_index: int, expected_value):
- """Verify that a node has expected input value
-
- Args:
- node (NodeProto): a node to check
- input_index (int): index of its input to be verified
- expected_value (Any): expected value of the input
-
- Returns:
- bool: whether the check is passed or not
- """
- assert len(node.input) > input_index
-
- value = self.model.get_constant_value(node.input[input_index])
-
- if isinstance(expected_value, list):
- return (
- isinstance(value, ndarray) or isinstance(value, list)
- ) and array_equal(expected_value, value, equal_nan=False)
- else:
- return value == expected_value
-
- def remove_identity_nodes(self):
- """Remove Identity nodes, except those right before graph output."""
- nodes_to_remove = []
- for node in self.model.nodes():
- if node.op_type == "Identity":
- if node.output[0] not in self.model.get_graphs_output_names():
- self.model.replace_input_of_all_nodes(node.output[0], node.input[0])
- nodes_to_remove.append(node)
-
- if nodes_to_remove:
- self.model.remove_nodes(nodes_to_remove)
- logger.info(f"Removed {len(nodes_to_remove)} Identity nodes")
-
- def remove_cascaded_cast_nodes(self):
- self.model.remove_cascaded_cast_nodes()
-
- def remove_useless_cast_nodes(self):
- self.model.remove_useless_cast_nodes()
-
- def remove_useless_reshape_nodes(self):
- """Remove reshape node that is not needed based on symbolic shape inference: input and output has same shape"""
- shape_infer = self.model.infer_runtime_shape(update=True)
- if shape_infer is None:
- return
-
- nodes_to_remove = []
- for node in self.model.nodes():
- if node.op_type == "Reshape":
- input_shape = shape_infer.get_edge_shape(node.input[0])
- output_shape = shape_infer.get_edge_shape(node.output[0])
- if input_shape and output_shape and input_shape == output_shape:
- logger.info(
- f"Remove reshape node {node.name} since its input shape is same as output: {input_shape}"
- )
- nodes_to_remove.append(node)
-
- if nodes_to_remove:
- graph_input_names = set(self.model.get_graphs_input_names())
- graph_output_names = set(self.model.get_graphs_output_names())
- for node in nodes_to_remove:
- if bool(set(node.output) & graph_output_names):
- if not bool(set(node.input) & graph_input_names):
- self.model.replace_output_of_all_nodes(
- node.input[0], node.output[0]
- )
- else:
- continue
- else:
- self.model.replace_input_of_all_nodes(node.output[0], node.input[0])
- self.model.remove_node(node)
-
-
-class NumpyHelper:
- @staticmethod
- def to_array(tensor: TensorProto, fill_zeros: bool = False) -> ndarray:
-        # When weights are in external data format but not present, we can still test the optimizer with two changes:
- # (1) set fill_zeros = True (2) change load_external_data=False in optimizer.py
- if fill_zeros:
- from onnx import mapping
-
- return ndarray(
- shape=tensor.dims,
- dtype=mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.data_type],
- )
-
- return numpy_helper.to_array(tensor)
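As a quick, self-contained illustration of what `transpose_2d_int8_tensor` above does with the raw buffer (toy values, hypothetical tensor name; unlike the deleted helper, this sketch also updates `dims` so the result round-trips cleanly):

```python
import numpy as np
from onnx import numpy_helper

# A 2x3 INT8 initializer, as produced by quantization.
weights = np.arange(6, dtype=np.int8).reshape(2, 3)
tensor = numpy_helper.from_array(weights, name="w_int8")

# Transpose via the raw buffer, mirroring FusionUtils.transpose_2d_int8_tensor.
data = np.frombuffer(tensor.raw_data, dtype=np.int8).reshape(tuple(tensor.dims))
tensor.raw_data = np.ascontiguousarray(data.T).tobytes()

# Keep the recorded shape consistent with the transposed buffer.
new_dims = [tensor.dims[1], tensor.dims[0]]
del tensor.dims[:]
tensor.dims.extend(new_dims)

assert numpy_helper.to_array(tensor).shape == (3, 2)
```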
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py
deleted file mode 100644
index d3244b7a609da3d8bfda6f91ed606259093e59c4..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py
+++ /dev/null
@@ -1,358 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class FusionVideoBertAttention(Fusion):
- """
- Fuse VideoBertAttention subgraph into one Attention node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(
- model,
- "CustomQKVToContextPluginDynamic_IxRT",
- ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],
- )
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def get_num_heads_and_hidden_size(
- self, atten_matmul: NodeProto, div: NodeProto
- ) -> Tuple[int, int]:
- """Detect num_heads and hidden_size from a reshape node.
-
- Args:
- reshape_q (NodeProto): reshape node for Q
-
- Returns:
- Tuple[int, int]: num_heads and hidden_size
- """
-
-        # hidden_size is taken from the attention output weight shape; head_size is derived from the Div scale (sqrt(head_size))
- atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1])
- div_initializer = self.model.get_initializer(div.input[1])
-
-        # Check whether float_data is empty
- if len(div_initializer.float_data) > 0:
- div_value = div_initializer.float_data[0]
- else:
-            # If float_data is empty, try other ways to read the data,
-            # e.g. the value may be stored in raw_data
- if len(div_initializer.raw_data) > 0:
- dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type]
- div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0]
- else:
- raise ValueError("Data not found in the div_initializer")
-
- atten_matul_shape_value = NumpyHelper.to_array(atten_matul_initializer).shape
- head_dim = math.ceil(div_value * div_value)
- hidden_size = atten_matul_shape_value[0]
- num_heads = hidden_size // head_dim
-
- return num_heads, hidden_size
-
- def create_attention_node(
- self,
- num_heads: int,
- hidden_size: int,
- input: str,
- output: str,
- matmul_qk_add: NodeProto,
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
- hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
- input (str): input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- assert num_heads > 0
-
- if hidden_size > 0 and (hidden_size % num_heads) != 0:
- logger.debug(
- f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
- )
- return None
-
- attention_node_name = self.model.create_node_name("Attention")
-
- qk_bias = None
- has_mask = 0
- has_qk_bias = 0
- if matmul_qk_add is not None:
- has_qk_bias = 1
- qk_bias = self.model.get_initializer(matmul_qk_add.input[1])
- qk_bias_arr = NumpyHelper.to_array(qk_bias)
- if len(qk_bias_arr.shape) == 3:
- qk_bias_arr = qk_bias_arr.squeeze(0)
- has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0)
- if np.any(has_neg_inf):
- qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype(
- np.float32
- )
- qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name))
-
- attention_inputs = [input]
-
- if qk_bias is not None:
- has_mask = 1
- attention_inputs.append(qk_bias.name)
-
- attention_node = helper.make_node(
- "CustomQKVToContextPluginDynamic_IxRT",
- inputs=attention_inputs,
- outputs=[output],
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
- attention_node.attribute.extend(
- [helper.make_attribute("hidden_size", hidden_size)]
- )
- attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend(
- [helper.make_attribute("has_qk_bias", has_qk_bias)]
- )
-
- return attention_node
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
-        # Sometimes we cannot fuse skiplayernormalization since the add before layernorm has an output that is used by nodes outside skiplayernorm
- # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
- start_node = normalize_node
- if normalize_node.op_type == "LayerNormalization":
- add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
- if add_before_layernorm is not None:
- start_node = add_before_layernorm
-
- # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
- qkv_paths = {
- "path1": (
- ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
- [0, None, 0, 0, 0],
- ),
- "path2": (
- ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
- [1, None, 0, 0, 0],
- ),
- }
-
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
-
- if qkv_nodes is None:
- logger.debug("fuse_attention: failed to match qkv path")
- return
-
- if qkv_path in ["path1", "path2"]:
- (_, atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
-
- other_inputs = []
- for i, input in enumerate(start_node.input):
- if input not in output_name_to_node:
- continue
-
- if input == qkv_nodes[0].output[0]:
- continue
- other_inputs.append(input)
- if len(other_inputs) != 1:
- return
-
- root_input = other_inputs[0]
- """
- Match videobert
- transpose/Add --> LayerNormalization --> Attention --> Add --> LayerNormalization
- | |
- | |
- +---------------------------------------------------------
- """
- transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0)
- if transpose_before_layernorm is not None:
- node_children = input_name_to_nodes[transpose_before_layernorm.output[0]]
- for child in node_children:
- if child is not None and child.op_type == "LayerNormalization":
- root_input = child.output[0]
-
- add_before_layernorm = self.model.match_parent(start_node, "Add", None)
- if add_before_layernorm is not None:
- node_children = input_name_to_nodes[add_before_layernorm.output[0]]
- for child in node_children:
- if child is not None and child.op_type == "LayerNormalization":
- root_input = child.output[0]
-
- v_paths = {
- "path1": (
- ["Transpose", "Reshape", "Slice", "Add", "MatMul"],
- [1, 0, 0, 0, None],
- ) # videobert
- }
-
- v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
- if v_path == "path1":
- (_, _, _, add_in_qkv, matmul_in_qkv) = v_nodes
-
- if v_nodes is None:
- logger.debug("fuse_attention: failed to match v path")
- return
-
- qk_paths = {
- "path1": (["Softmax", "MatMul"], [0, 0]),
- "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]),
- }
-
- qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths)
-
- if qk_nodes is None:
- logger.debug("fuse_attention: failed to match qk path")
- return
-
- matmul_qk_add = None
- if qk_path == "path1":
- (_, matmul_qk) = qk_nodes
- else:
- (_, matmul_qk_add, matmul_qk) = qk_nodes
-
- q_paths = {
- "path1": (["Transpose", "Reshape", "Slice"], [0, 0, 0]),
- "path2": (["Div", "Transpose", "Reshape", "Slice"], [0, 0, 0, 0]),
- }
- q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths)
- if q_nodes is None:
- logger.debug("fuse_attention: failed to match q path")
- return
-
- if q_path == "path1":
- (_, _, slice_q) = q_nodes
- else:
- (div, _, _, slice_q) = q_nodes
-
- k_paths = {
- "path1": (["Transpose", "Reshape", "Slice"], [1, 0, 0]),
- "path2": (["Div", "Transpose", "Reshape", "Slice"], [1, 0, 0, 0]),
- }
- k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths)
-
- if k_nodes is None:
- logger.debug("fuse_attention: failed to match k path")
- return
-
- if k_path == "path1":
- (_, _, slice_k) = k_nodes
- else:
- (div, _, _, slice_k) = k_nodes
-
- if (
- matmul_in_qkv.input[0] == root_input
- and slice_q.input[0] == add_in_qkv.output[0]
- and slice_k.input[0] == add_in_qkv.output[0]
- ):
- attention_last_node = reshape_qkv
-
- num_heads, hidden_size = self.get_num_heads_and_hidden_size(
- atten_matmul, div
- )
-
- new_node = self.create_attention_node(
- num_heads,
- hidden_size,
- add_in_qkv.output[0],
- attention_last_node.output[0],
- matmul_qk_add,
- )
- if new_node is None:
- return
-
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
-
- self.nodes_to_remove.extend(
- [attention_last_node, transpose_qkv, matmul_qkv]
- )
- self.nodes_to_remove.extend(qk_nodes)
- self.nodes_to_remove.extend(q_nodes)
- self.nodes_to_remove.extend(k_nodes)
- self.nodes_to_remove.extend(v_nodes[:-2])
-
- # fuse head and tail transpose
- if transpose_before_layernorm is not None:
- node_children = input_name_to_nodes[
- transpose_before_layernorm.output[0]
- ]
- for child in node_children:
- for i, input in enumerate(child.input):
- if child.input[i] == transpose_before_layernorm.output[0]:
- child.input[i] = transpose_before_layernorm.input[0]
- self.nodes_to_remove.extend([transpose_before_layernorm])
-
- node = transpose_before_layernorm
- while True:
- found = False
- node_children = input_name_to_nodes[node.output[0]]
- for child in node_children:
- if child is not None and child.op_type in [
- "SkipLayerNorm",
- "Add",
- ]:
- node = child
- found = True
- break
- if not found:
- break
- node_children = input_name_to_nodes[node.output[0]]
- if len(node_children) == 1 and node_children[0].op_type == "Transpose":
- transpose_node = node_children[0]
- transpose_children = input_name_to_nodes[transpose_node.output[0]]
- for i, input in enumerate(transpose_children[0].input):
- if transpose_children[0].input[i] == transpose_node.output[0]:
- transpose_children[0].input[i] = transpose_node.input[0]
- self.nodes_to_remove.extend([transpose_node])
- # Use prune graph to remove mask nodes since they are shared by all attention nodes.
- # self.nodes_to_remove.extend(mask_nodes)
- # self.prune_graph = True
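The head-count arithmetic in `get_num_heads_and_hidden_size` above is easy to sanity-check in isolation. Assuming a BERT-base style hidden size of 768 and the usual `1/sqrt(head_size)` attention scaling (so the Div constant is 8.0), the recovered values would be:

```python
import math

hidden_size = 768   # rows of the attention output weight (illustrative)
div_value = 8.0     # Div constant, i.e. sqrt(head_size) for head_size = 64

head_size = math.ceil(div_value * div_value)   # 64, as computed in the pass above
num_heads = hidden_size // head_size           # 12

assert (head_size, num_heads) == (64, 12)
```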
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py
deleted file mode 100644
index f1a5410b62283e45f4f0a8957eaf7e83be6a6124..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py
+++ /dev/null
@@ -1,469 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-import math
-from typing import Dict
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class FusionVITAttention(Fusion):
- """
- Fuse VITAttention subgraph into one Attention node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(
- model,
- "CustomQKVToContextPluginDynamic_IxRT",
- ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],
- )
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def get_num_heads_and_hidden_size(
- self, custom_fc: NodeProto, mul: NodeProto
- ) -> Tuple[int, int]:
- mul_initializer = self.model.get_initializer(mul.input[1])
-
-        # Check whether float_data is empty
- if len(mul_initializer.float_data) > 0:
- mul_value = mul_initializer.float_data[0]
- else:
-            # If float_data is empty, try other ways to read the data,
-            # e.g. the value may be stored in raw_data
- if len(mul_initializer.raw_data) > 0:
- dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type]
- mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0]
- else:
- raise ValueError("Data not found in the mul_initializer")
-
- for attr in custom_fc.attribute:
- if attr.name == "W":
- tensor_value = attr.t
- tensor_shape = [dim for dim in tensor_value.dims]
- break
- head_dim = math.floor(1.0 / (mul_value * mul_value)) * math.floor(
- 1.0 / (mul_value * mul_value)
- )
- hidden_size = tensor_shape[0]
- num_heads = hidden_size // head_dim
-
- return num_heads, hidden_size
-
- def create_attention_node(
- self,
- num_heads: int,
- hidden_size: int,
- input: str,
- output: str,
- matmul_qk_add: NodeProto,
- ) -> Union[NodeProto, None]:
- """Create an Attention node.
-
- Args:
- num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
- hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
- input (str): input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- assert num_heads > 0
- # print(hidden_size, num_heads)
- if hidden_size > 0 and (hidden_size % num_heads) != 0:
- logger.debug(
- f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
- )
- return None
-
- attention_node_name = self.model.create_node_name("Attention")
-
- qk_bias = None
- has_mask = 0
- has_qk_bias = 0
- if matmul_qk_add is not None:
- has_qk_bias = 1
- qk_bias = self.model.get_initializer(matmul_qk_add.input[1])
- qk_bias_arr = NumpyHelper.to_array(qk_bias)
- if len(qk_bias_arr.shape) == 3:
- qk_bias_arr = qk_bias_arr.squeeze(0)
- has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0)
- if np.any(has_neg_inf):
- qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype(
- np.float32
- )
- qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name))
-
- attention_inputs = [input]
-
- if qk_bias is not None:
- has_mask = 1
- attention_inputs.append(qk_bias.name)
-
- attention_node = helper.make_node(
- "CustomQKVToContextPluginDynamic_IxRT",
- inputs=attention_inputs,
- outputs=[output],
- name=attention_node_name,
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
- attention_node.attribute.extend(
- [helper.make_attribute("hidden_size", hidden_size)]
- )
- attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend(
- [helper.make_attribute("has_qk_bias", has_qk_bias)]
- )
-
- return attention_node
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
-        # Sometimes we cannot fuse skiplayernormalization since the add before layernorm has an output that is used by nodes outside skiplayernorm
- # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
- start_node = normalize_node
- if normalize_node.op_type == "LayerNormalization":
- add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
- if add_before_layernorm is not None:
- start_node = add_before_layernorm
-
- # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
- qkv_paths = {
- "path1": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [0, 0, 0]),
- "path2": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [1, 0, 0]),
- }
-
- qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
-
- if qkv_nodes is None:
- logger.debug("fuse_attention: failed to match qkv path")
- return
-
- if qkv_path in ["path1", "path2"]:
- (custom_fc_after_atten, transpose_qkv, matmul_qkv) = qkv_nodes
-
- other_inputs = []
- for i, input in enumerate(start_node.input):
- if input not in output_name_to_node:
- continue
-
- if input == qkv_nodes[0].output[0]:
- continue
- other_inputs.append(input)
- if len(other_inputs) != 1:
- return
-
- root_input = other_inputs[0]
- """
- Match VIT
- transpose --> LayerNormalization --> custom_fc -> attention -> Add
- | |
- | |
- +-------------------------------------------------------------------
- """
- transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0)
- if transpose_before_layernorm is not None:
- node_children = input_name_to_nodes[transpose_before_layernorm.output[0]]
- for child in node_children:
- if child is not None and child.op_type == "LayerNormalization":
- root_input = child.output[0]
-
- add_before_layernorm = self.model.match_parent(start_node, "Add", None)
- if add_before_layernorm is not None:
- node_children = input_name_to_nodes[add_before_layernorm.output[0]]
- for child in node_children:
- if child is not None and child.op_type == "LayerNormalization":
- root_input = child.output[0]
-
- # print("root_input: ", root_input, matmul_qkv.name)
- v_paths = {
- "path1": (
- [
- "Reshape",
- "Transpose",
- "Reshape",
- "Gather",
- "Squeeze",
- "Transpose",
- "Unsqueeze",
- "Reshape",
- "CustomFCPluginDynamic_IxRT",
- ],
- [1, 0, 0, 0, 0, 0, 0, 0, 0],
- ) # vit
- }
-
- v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
-
- squeeze_input = custom_fc = None
- if v_path == "path1":
- (_, _, _, _, squeeze_input, _, _, _, custom_fc) = v_nodes
-
- if v_nodes is None:
- logger.debug("fuse_attention: failed to match v path")
- return
-
- qk_paths = {
- "path1": (["Softmax", "MatMul"], [0, 0]),
- "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]),
- }
-
- qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths)
- # print("qk_nodes:", qk_nodes[1].name)
- if qk_nodes is None:
- logger.debug("fuse_attention: failed to match qk path")
- return
-
- matmul_qk_add = None
- if qk_path == "path1":
- (_, matmul_qk) = qk_nodes
- else:
- (_, matmul_qk_add, matmul_qk) = qk_nodes
-
- q_paths = {
- "path1": (
- ["Mul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze"],
- [0, 0, 0, 0, 0, 0],
- ),
- }
- q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths)
- # print("q_nodes:", q_nodes[0].name)
- squeeze_q = mul_q = None
- if q_path == "path1":
- squeeze_q = q_nodes[-1]
- mul_q = q_nodes[0]
-
- if q_nodes is None:
- logger.debug("fuse_attention: failed to match q path")
- return
-
- k_paths = {
- "path1": (
- [
- "Mul",
- "Transpose",
- "Reshape",
- "Transpose",
- "Reshape",
- "Gather",
- "Squeeze",
- ],
- [1, 0, 0, 0, 0, 0, 0],
- ),
- }
- k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths)
- # print("k_nodes:", k_nodes[0].name)
- squeeze_k = None
- if k_path == "path1":
- squeeze_k = k_nodes[-1]
-
- if k_nodes is None:
- logger.debug("fuse_attention: failed to match k path")
- return
-
- if (
- custom_fc.input[0] == root_input
- and squeeze_input == squeeze_q
- and squeeze_input == squeeze_k
- ):
- attention_last_node = transpose_qkv
-
- num_heads, hidden_size = self.get_num_heads_and_hidden_size(
- custom_fc_after_atten, mul_q
- )
-
- new_node = self.create_attention_node(
- num_heads,
- hidden_size,
- custom_fc.output[0],
- attention_last_node.output[0],
- matmul_qk_add,
- )
- if new_node is None:
- return
-
- self.nodes_to_add.append(new_node)
- self.node_name_to_graph_name[new_node.name] = self.this_graph_name
-
- self.nodes_to_remove.extend([transpose_qkv, matmul_qkv])
- self.nodes_to_remove.extend(qk_nodes)
- self.nodes_to_remove.extend(q_nodes[:-1])
- self.nodes_to_remove.extend(k_nodes[:-1])
- self.nodes_to_remove.extend(v_nodes[:-1])
-
- # fuse head and tail transpose
- if transpose_before_layernorm is not None:
- node_children = input_name_to_nodes[
- transpose_before_layernorm.output[0]
- ]
- for child in node_children:
- for i, input in enumerate(child.input):
- if child.input[i] == transpose_before_layernorm.output[0]:
- child.input[i] = transpose_before_layernorm.input[0]
- self.nodes_to_remove.extend([transpose_before_layernorm])
-
- node = transpose_before_layernorm
- while True:
- found = False
- node_children = input_name_to_nodes[node.output[0]]
- for child in node_children:
- if child is not None and child.op_type in [
- "SkipLayerNorm",
- "Add",
- ]:
- node = child
- found = True
- break
- if not found:
- break
- node_children = input_name_to_nodes[node.output[0]]
- if len(node_children) == 1 and node_children[0].op_type == "Transpose":
- transpose_node = node_children[0]
- transpose_children = input_name_to_nodes[transpose_node.output[0]]
- for i, input in enumerate(transpose_children[0].input):
- if transpose_children[0].input[i] == transpose_node.output[0]:
- transpose_children[0].input[i] = transpose_node.input[0]
- self.nodes_to_remove.extend([transpose_node])
- # Use prune graph to remove mask nodes since they are shared by all attention nodes.
- # self.nodes_to_remove.extend(mask_nodes)
- # self.prune_graph = True
-
-
-class FusionTorchvisionVITAttention(Fusion):
- """
-    Fuse the torchvision ViT attention subgraph into one Attention node.
- """
-
- def __init__(self, model: OnnxModel):
- super().__init__(
- model, "CustomQKVToContextPluginDynamic_IxRT", "CustomFCPluginDynamic_IxRT"
- )
-
- def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
- """
- [Root] --> CustomFCPluginDynamic_IxRT--> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT
- """
- children = self.model.get_children(node, input_name_to_nodes)
- parent = self.model.get_parents(node, output_name_to_node)
-
- if len(children) != 1:
- return
- if len(parent) != 1:
- return
-
- fc_first_node = None
- for par in parent:
- fc_first_node = self.model.find_first_parent_by_type(
- par, "CustomFCPluginDynamic_IxRT", output_name_to_node, recursive=True
- )
- if fc_first_node is not None:
- break
- if fc_first_node is None:
- return
-
- start_node = node
-
- # v path
- v_nodes = self.model.match_parent_path(
- start_node,
- ["Transpose", "MatMul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze", "Transpose", "Unsqueeze", "Reshape"],
- [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
-
- # path1, q and k path
- q_nodes = self.model.match_parent_path(
- start_node,
- ["Transpose", "MatMul", "Softmax", "MatMul", "Mul", "Transpose", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze", "Transpose", "Unsqueeze", "Reshape"],
- [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
-
- k_nodes = self.model.match_parent_path(
- start_node,
- ["Transpose", "MatMul", "Softmax", "MatMul", "Mul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze", "Transpose", "Unsqueeze", "Reshape"],
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- output_name_to_node,
- )
-
- if v_nodes is None:
- return
-
- if v_nodes and q_nodes and k_nodes:
- subgraph_nodes = []
- subgraph_nodes.extend(q_nodes)
- subgraph_nodes.extend(k_nodes)
- subgraph_nodes.extend(v_nodes)
-
- subgraph_nodes_unique = []
- for item in subgraph_nodes:
- if item not in subgraph_nodes_unique:
- subgraph_nodes_unique.append(item)
-
- hidden_size = start_node.attribute[0].i
- _, mul_val = self.model.get_constant_input(k_nodes[4])
- num_heads = hidden_size // (math.floor(1.0 / (mul_val * mul_val)) * math.floor(1.0 / (mul_val * mul_val)))
-
- attention_node = helper.make_node(
- "CustomQKVToContextPluginDynamic_IxRT",
- inputs=[fc_first_node.output[0]],
- outputs=[start_node.input[0]],
- name=self.model.create_node_name(
- "TorchvisionVitAttention", name_prefix="TorchvisionVitAttention"
- ),
- )
- attention_node.domain = "com.iluvatar"
- attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
- attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
- attention_node.attribute.extend([helper.make_attribute("hidden_size", hidden_size)])
- attention_node.attribute.extend([helper.make_attribute("has_mask", 0)])
- attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 0)])
-
- self.nodes_to_remove.extend(subgraph_nodes_unique)
-
- self.nodes_to_add.append(attention_node)
- self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
\ No newline at end of file
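Both attention passes in this file (and the BERT/T5 variants above) clamp `-inf` entries of an additive QK bias to a finite negative value before rewriting the initializer. In isolation the transformation is only a couple of NumPy calls (the example mask values are made up):

```python
import numpy as np
from onnx import numpy_helper

# Toy additive attention bias: 0 where attention is allowed, -inf where masked.
qk_bias = np.array([[0.0, -np.inf],
                    [0.0, 0.0]], dtype=np.float32)

# The same clamping the fusion passes apply before handing the bias to the plugin.
if np.any(np.isinf(qk_bias) & (qk_bias < 0)):
    qk_bias = np.where(qk_bias == -np.inf, -100, 0.0).astype(np.float32)

clamped = numpy_helper.from_array(qk_bias, "qk_bias")  # ready for initializer.CopyFrom()
```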
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py
deleted file mode 100644
index df55ba645988ddbffcd157e38db2c73ff34789a2..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import Tuple, Union
-
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class FusionXSoftmax(Fusion):
- """
- Fuse Where + Softmax + Where into one node: XSoftmax
- """
-
- def __init__(self, model: OnnxModel):
- super().__init__(model, "XSoftmax_IxRT", "MatMul")
-
- def create_xsoftmax_node(
- self, data_input: str, mask_input: str, output: str
- ) -> Union[NodeProto, None]:
- """Create an XSoftmax node.
-
- Args:
- data_input (str): data input name
-            mask_input (str): mask input name
- output (str): output name
-
- Returns:
- Union[NodeProto, None]: the node created or None if failed.
- """
- xsoftmax_node_name = self.model.create_node_name("XSoftmax")
-
- xsoftmax_node = helper.make_node(
- "XSoftmax_IxRT",
- inputs=[data_input, mask_input],
- outputs=[output],
- name=xsoftmax_node_name,
- )
- xsoftmax_node.domain = "com.iluvatar"
- xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
- xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
- xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)])
- xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)])
-
- return xsoftmax_node
-
- def fuse(self, node, input_name_to_nodes, output_name_to_node):
-
- xsoftmax_paths = {
- "path": (["Where", "Softmax", "Where", "Add"], [None, None, None, None]),
- }
- xsoftmax_nodes, xsoftmax_path = self.match_parent_path_from_dict(
- node, xsoftmax_paths
- )
-
- if xsoftmax_nodes is None:
- logger.debug("fuse_xsoftmax: failed to match xsoftmax path")
- return
- else:
- (tail_where, softmax, head_where, add) = xsoftmax_nodes
- where_inputs = [i for i in tail_where.input if i in head_where.input]
- assert len(where_inputs) == 1
- mask_input = where_inputs[0]
- data_input = add.output[0]
- data_output = tail_where.output[0]
-
- xsoftmax_node = self.create_xsoftmax_node(
- data_input, mask_input, data_output
- )
-
- self.nodes_to_add.append(xsoftmax_node)
- self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name
- self.nodes_to_remove.append(tail_where)
- self.nodes_to_remove.append(softmax)
- self.nodes_to_remove.append(head_where)
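Functionally, the Where -> Softmax -> Where chain that this pass collapses is a masked softmax. A small NumPy sketch of the equivalent computation (mask semantics assumed here: True keeps a position, False masks it out) shows why one `XSoftmax_IxRT` node can replace the three originals:

```python
import numpy as np

def masked_softmax(scores, mask):
    # head Where: push masked positions to a very negative value
    filled = np.where(mask, scores, np.finfo(scores.dtype).min)
    # Softmax over the last axis
    e = np.exp(filled - filled.max(axis=-1, keepdims=True))
    probs = e / e.sum(axis=-1, keepdims=True)
    # tail Where: zero the masked positions again
    return np.where(mask, probs, 0.0)

scores = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)
mask = np.array([[True, True, False]])
print(masked_softmax(scores, mask))  # weight is spread over the unmasked entries only
```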
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py
deleted file mode 100644
index f2d07ce96d60c5e8fbfc749d1049bad471525239..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import List, Tuple, Union
-
-import numpy as np
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-def get_tensor_attr(attrs, attr_name):
- result = None
- for i in attrs:
- if i.name == attr_name:
- return numpy_helper.to_array(i.t)
- return result
-
-
-class FusionYoloV5Decoder(Fusion):
- """
-    Fuse the YoloV5 decoder subgraph into one YoloV5Decoder node.
- """
-
- def __init__(
- self,
- model: OnnxModel,
- ):
- super().__init__(model, "YoloV5Decoder", ["Reshape"])
-
- # Flags to show warning only once
- self.num_heads_warning = True
- self.hidden_size_warning = True
-
- def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
- short_path = ["Concat", "Slice", "Sigmoid", "Transpose", "Reshape"]
- paths = [
- (["Concat", "Unsqueeze", "Gather", "Shape"], [1] + [None] * 3),
- (
- ["Concat", "Mul", "Add", "Sub", "Mul", "Slice", "Sigmoid", "Transpose"],
- [0, 0] + [None] * 6,
- ),
- (
- ["Concat", "Mul", "Pow", "Mul", "Slice", "Sigmoid", "Transpose"],
- [0, 1] + [None] * 5,
- ),
- (short_path, [None] * 5),
- (short_path + ["Concat", "Unsqueeze", "Gather", "Shape"], [None] * 9),
- ]
- paths_found = []
- nodes_names_found = set()
- nodes_found = []
- for path_i in paths:
- nodes = self.model.match_parent_path(normalize_node, path_i[0], path_i[1])
- paths_found.append(nodes)
- if nodes:
- for n in nodes:
- if n.name not in nodes_names_found:
- nodes_names_found.add(n.name)
- nodes_found.append(n)
- if not all(paths_found):
- return
- shape_node = paths_found[-1][-1]
- params = self._find_yolov5_decoder_params(paths_found)
- self._fuse_node(
- inputs=shape_node.input, outputs=normalize_node.output, params=params
- )
- self.nodes_to_remove.extend(nodes_found)
- self._delete_extra_output_edges(paths_found)
- self.prune_graph = True
-
- def _fuse_node(self, inputs, outputs, params):
- fused_node = helper.make_node(
- "YoloV5Decoder",
- inputs=inputs,
- outputs=outputs,
- name=self.model.create_node_name("YoloV5Decoder"),
- )
- fused_node.attribute.extend(params)
- self.nodes_to_add.append(fused_node)
- self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
-
- def _delete_extra_output_edges(self, paths_found):
- transpose_node = paths_found[2][-1]
- assert transpose_node.op_type == "Transpose"
- out_edge = transpose_node.output[0]
- for item in self.model.graph().output:
- if item.name == out_edge:
- self.model.graph().output.remove(item)
- logger.warning(f"Output: {out_edge} is useless in graph, delete it")
- return
-
- def _find_yolov5_decoder_params(self, paths_found):
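-        # Extract the decoder attributes (num_class, stride, anchor) from the
-        # initializers of the matched subgraph so they can be attached to the fused node.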
- # num_class
- concat_op = paths_found[0][0]
- assert concat_op.op_type == "Concat"
- num_class_arr = self.model.get_initializer(concat_op.input[2], True)
-        assert num_class_arr is not None
- num_class = (num_class_arr - 5).tolist()[0]
- num_class = helper.make_attribute("num_class", num_class)
-
- # stride
- mul_op = paths_found[1][1]
- assert mul_op.op_type == "Mul"
- input_arrs = self.model.get_initializer_input_edges(mul_op.name, True)
- assert len(input_arrs) == 1
- stride = input_arrs[0].tolist()
- stride = helper.make_attribute("stride", stride)
-
- # anchor
- mul_op = paths_found[2][1]
- assert mul_op.op_type == "Mul"
- anchor = self.model.get_initializer_input_edges(mul_op.name, True)
- assert len(anchor) == 1
- anchor = anchor[0]
- anchor = anchor[0, :, 0, 0, :] if len(anchor.shape) == 5 else anchor[:, 0, 0, :]
- anchor = helper.make_attribute("anchor", list(anchor.flatten()))
-
-        # faster_impl
- fast_impl = helper.make_attribute("faster_impl", 1)
-
- return [num_class, stride, anchor, fast_impl]
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py
deleted file mode 100644
index 0b76f660fce62ec0aa19b8c132a6ba51cf6fe319..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py
+++ /dev/null
@@ -1,1182 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-import logging
-import os
-import sys
-from collections import deque
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-from onnx import (
- AttributeProto,
- GraphProto,
- ModelProto,
- NodeProto,
- TensorProto,
- helper,
- numpy_helper,
- save_model,
-)
-
-from .float16 import convert_float_to_float16
-from .shape_infer_helper import SymbolicShapeInferenceHelper
-
-logger = logging.getLogger(__name__)
-
-
-class OnnxModel:
- def __init__(self, model):
- self.initialize(model)
- self.initializer_visited: Dict[str, bool] = {}
-
- def initialize(self, model):
- self.model: ModelProto = model
- self._node_name_suffix: Dict[
- str, int
- ] = {} # key is node name prefix, value is the last suffix generated
- self.shape_infer_helper: SymbolicShapeInferenceHelper = None
- self.enable_shape_infer: bool = True
- self.all_graphs: Optional[List[GraphProto]] = None
-
- def disable_shape_inference(self):
- self.enable_shape_infer = False
-
- def infer_runtime_shape(self, dynamic_axis_mapping={}, update=False):
- if self.enable_shape_infer:
- if self.shape_infer_helper is None or update:
- self.shape_infer_helper = SymbolicShapeInferenceHelper(self.model)
-
- try:
- if self.shape_infer_helper.infer(dynamic_axis_mapping):
- return self.shape_infer_helper
-            except Exception:
-                self.enable_shape_infer = (
-                    False  # disable shape inference to suppress repeated error messages.
-                )
-                logger.warning("failed in shape inference: %s", sys.exc_info()[0])
-
- return None
-
- def input_name_to_nodes(self):
- input_name_to_nodes = {}
- for node in self.nodes():
- for input_name in node.input:
- if input_name not in input_name_to_nodes:
- input_name_to_nodes[input_name] = [node]
- else:
- input_name_to_nodes[input_name].append(node)
- return input_name_to_nodes
-
- def output_name_to_node(self):
- output_name_to_node = {}
- for node in self.nodes():
- for output_name in node.output:
- output_name_to_node[output_name] = node
- return output_name_to_node
-
- def nodes(self):
- all_nodes = []
- for graph in self.graphs():
- for node in graph.node:
- all_nodes.append(node)
- return all_nodes
-
- def graph(self):
- return self.model.graph
-
- def graphs(self):
- if self.all_graphs is not None:
- return self.all_graphs
- self.all_graphs = []
- graph_queue = [self.model.graph]
- while graph_queue:
- graph = graph_queue.pop(0)
- self.all_graphs.append(graph)
- for node in graph.node:
- for attr in node.attribute:
- if attr.type == AttributeProto.AttributeType.GRAPH:
- assert isinstance(attr.g, GraphProto)
- graph_queue.append(attr.g)
- if attr.type == AttributeProto.AttributeType.GRAPHS:
- for g in attr.graphs:
- assert isinstance(g, GraphProto)
- graph_queue.append(g)
- return self.all_graphs
-
- def get_graphs_input_names(self):
- input_names = []
- for graph in self.graphs():
- for input in graph.input:
- input_names.append(input.name)
- return input_names
-
- def get_graphs_output_names(self):
- output_names = []
- for graph in self.graphs():
- for output in graph.output:
- output_names.append(output.name)
- return output_names
-
- def get_graph_by_node(self, node):
- for graph in self.graphs():
- if node in graph.node:
- return graph
- return None
-
- def get_graph_by_name(self, graph_name):
- for graph in self.graphs():
- if graph_name == graph.name:
- return graph
- return None
-
- def get_topological_insert_id(self, graph, outputs):
- for idx, node in enumerate(graph.node):
- for input in node.input:
- if input in outputs:
- return idx
- return len(graph.node)
-
- def remove_node(self, node):
- for graph in self.graphs():
- if node in graph.node:
- graph.node.remove(node)
-
- def remove_nodes(self, nodes_to_remove):
- for node in nodes_to_remove:
- self.remove_node(node)
-
- def add_node(self, node, graph_name=None):
- if graph_name is None or graph_name == self.model.graph.name:
- self.model.graph.node.extend([node])
- else:
- graph = self.get_graph_by_name(graph_name)
- insert_idx = self.get_topological_insert_id(graph, node.output)
- graph.node.insert(insert_idx, node)
-
- def add_nodes(self, nodes_to_add, node_name_to_graph_name=None):
- if node_name_to_graph_name is None:
- self.model.graph.node.extend(nodes_to_add)
- else:
- for node in nodes_to_add:
- graph_name = node_name_to_graph_name[node.name]
- self.add_node(node, graph_name)
-
- def add_initializer(self, tensor, graph_name=None):
- if graph_name is None or graph_name == self.model.graph.name:
- self.model.graph.initializer.extend([tensor])
- else:
- graph = self.get_graph_by_name(graph_name)
- graph.initializer.extend([tensor])
-
- def add_input(self, input, graph_name=None):
- if graph_name is None or graph_name == self.model.graph.name:
- self.model.graph.input.extend([input])
- else:
- graph = self.get_graph_by_name(graph_name)
- graph.input.extend([input])
-
- @staticmethod
- def replace_node_input(node, old_input_name, new_input_name):
- assert isinstance(old_input_name, str) and isinstance(new_input_name, str)
- for j in range(len(node.input)):
- if node.input[j] == old_input_name:
- node.input[j] = new_input_name
-
- def replace_input_of_all_nodes(self, old_input_name, new_input_name):
- for node in self.model.graph.node:
- OnnxModel.replace_node_input(node, old_input_name, new_input_name)
-
- @staticmethod
- def replace_node_output(node, old_output_name, new_output_name):
- assert isinstance(old_output_name, str) and isinstance(new_output_name, str)
- for j in range(len(node.output)):
- if node.output[j] == old_output_name:
- node.output[j] = new_output_name
-
- def replace_output_of_all_nodes(self, old_output_name, new_output_name):
- for node in self.model.graph.node:
- OnnxModel.replace_node_output(node, old_output_name, new_output_name)
-
- def get_initializer(self, name, return_np_array=False):
- for graph in self.graphs():
- for tensor in graph.initializer:
- if tensor.name == name:
- return numpy_helper.to_array(tensor) if return_np_array else tensor
- return None
-
- def get_node(self, op_name):
- for graph in self.graphs():
- for n in graph.node:
- if n.name == op_name:
- return n
- return None
-
- def get_initializer_input_edges(self, op_name, return_np_array=False):
- initializers = {i.name: i for graph in self.graphs() for i in graph.initializer}
- node = self.get_node(op_name)
- assert node
- result = []
- for i in node.input:
- if i in initializers:
- tensor = initializers[i]
- tensor = numpy_helper.to_array(tensor) if return_np_array else tensor
- result.append(tensor)
- return result
-
- def get_nodes_by_op_type(self, op_type):
- nodes = []
- for node in self.nodes():
- if node.op_type == op_type:
- nodes.append(node)
- return nodes
-
- def get_children(self, node, input_name_to_nodes=None):
- if input_name_to_nodes is None:
- input_name_to_nodes = self.input_name_to_nodes()
-
- children = []
- for output in node.output:
- if output in input_name_to_nodes:
- for node in input_name_to_nodes[output]:
- children.append(node)
- return children
-
- def get_parents(self, node, output_name_to_node=None):
- if output_name_to_node is None:
- output_name_to_node = self.output_name_to_node()
-
- parents = []
- for input in node.input:
- if input in output_name_to_node:
- parents.append(output_name_to_node[input])
- return parents
-
- def get_parent(self, node, i, output_name_to_node=None):
- if output_name_to_node is None:
- output_name_to_node = self.output_name_to_node()
-
- if len(node.input) <= i:
- return None
-
- input = node.input[i]
- if input not in output_name_to_node:
- return None
-
- return output_name_to_node[input]
-
- def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=[]):
- """
- Find parent node based on constraints on op_type.
-
- Args:
-            node (NodeProto): the current node.
- parent_op_type (str): constraint of parent node op_type.
- output_name_to_node (dict): dictionary with output name as key, and node as value.
- exclude (list): list of nodes that are excluded (not allowed to match as parent).
-
- Returns:
- parent: The matched parent node. None if not found.
- index: The input index of matched parent node. None if not found.
- """
- for i, input in enumerate(node.input):
- if input in output_name_to_node:
- parent = output_name_to_node[input]
- if parent.op_type == parent_op_type and parent not in exclude:
- return parent, i
- else:
- logger.debug(
- f"To find first {parent_op_type}, current {parent.op_type}"
- )
- return None, None
-
- def match_parent(
- self,
- node,
- parent_op_type,
- input_index=None,
- output_name_to_node=None,
- exclude=[],
- return_indice=None,
- ):
- """
- Find parent node based on constraints on op_type and index.
-        When input_index is None, we find the first parent node that satisfies the constraints, and the corresponding input index is appended to return_indice.
-
- Args:
-            node (NodeProto): the current node.
- parent_op_type (str): constraint of parent node op_type.
- input_index (int or None): only check the parent given input index of current node.
- output_name_to_node (dict): dictionary with output name as key, and node as value.
- exclude (list): list of nodes that are excluded (not allowed to match as parent).
- return_indice (list): a list to append the input index when input_index is None.
-
- Returns:
- parent: The matched parent node.
- """
- assert node is not None
- assert input_index is None or input_index >= 0
-
- if output_name_to_node is None:
- output_name_to_node = self.output_name_to_node()
-
- if input_index is None:
- parent, index = self.match_first_parent(
- node, parent_op_type, output_name_to_node, exclude
- )
- if return_indice is not None:
- return_indice.append(index)
- return parent
-
- if input_index >= len(node.input):
- logger.debug(f"input_index {input_index} >= node inputs {len(node.input)}")
- return None
-
- parent = self.get_parent(node, input_index, output_name_to_node)
- if (
- parent is not None
- and parent.op_type == parent_op_type
- and parent not in exclude
- ):
- return parent
-
- if parent is not None:
- logger.debug(f"Expect {parent_op_type}, Got {parent.op_type}")
-
- return None
-
- def match_parent_paths(self, node, paths, output_name_to_node):
- for i, path in enumerate(paths):
- assert isinstance(path, List) or isinstance(path, Tuple)
- return_indice = []
- matched = self.match_parent_path(
- node, path[0], path[1], output_name_to_node, return_indice
- )
- if matched:
- return i, matched, return_indice
- return -1, None, None
-
- def match_parent_path(
- self,
- node,
- parent_op_types,
- parent_input_index,
- output_name_to_node=None,
- return_indice=None,
- ):
- """
- Find a sequence of input edges based on constraints on parent op_type and index.
-        When an entry of parent_input_index is None, the first parent node that satisfies the constraints is matched, and the corresponding input index is appended to return_indice.
-
- Args:
-            node (NodeProto): the current node.
-            parent_op_types (list): constraint on the parent node op_type of each input edge.
- parent_input_index (list): constraint of input index of each input edge. None means no constraint.
- output_name_to_node (dict): dictionary with output name as key, and node as value.
- return_indice (list): a list to append the input index when there is no constraint on input index of an edge.
-
- Returns:
- parents: a list of matched parent node.
- """
- assert len(parent_input_index) == len(parent_op_types)
-
- if output_name_to_node is None:
- output_name_to_node = self.output_name_to_node()
-
- current_node = node
- matched_parents = []
- for i, op_type in enumerate(parent_op_types):
- matched_parent = self.match_parent(
- current_node,
- op_type,
- parent_input_index[i],
- output_name_to_node,
- exclude=[],
- return_indice=return_indice,
- )
- if matched_parent is None:
- logger.debug(
- f"Failed to match index={i} parent_input_index={parent_input_index[i]} op_type={op_type}",
- stack_info=True,
- )
- return None
-
- matched_parents.append(matched_parent)
- current_node = matched_parent
-
- return matched_parents
-
- def find_first_child_by_type(
- self, node, child_type, input_name_to_nodes=None, recursive=True
- ):
- children = self.get_children(node, input_name_to_nodes)
- dq = deque(children)
- while len(dq) > 0:
- current_node = dq.pop()
- if current_node.op_type == child_type:
- return current_node
-
- if recursive:
- children = self.get_children(current_node, input_name_to_nodes)
- for child in children:
- dq.appendleft(child)
-
- return None
-
- def find_first_parent_by_type(
- self, node, parent_type, output_name_to_node=None, recursive=True
- ):
- if output_name_to_node is None:
- output_name_to_node = self.output_name_to_node()
-
- parents = self.get_parents(node, output_name_to_node)
- dq = deque(parents)
- while len(dq) > 0:
- current_node = dq.pop()
- if current_node.op_type == parent_type:
- return current_node
-
- if recursive:
- parents = self.get_parents(current_node, output_name_to_node)
- for parent in parents:
- dq.appendleft(parent)
-
- return None
-
- def get_constant_value(self, output_name):
- for node in self.get_nodes_by_op_type("Constant"):
- if node.output[0] == output_name:
- for att in node.attribute:
- if att.name == "value":
- return numpy_helper.to_array(att.t)
-
-        # Fall back to initializer since constant folding might have been applied.
- initializer = self.get_initializer(output_name)
- if initializer is not None:
- return numpy_helper.to_array(initializer)
-
- return None
-
- def get_constant_input(self, node):
- for i, input in enumerate(node.input):
- value = self.get_constant_value(input)
- if value is not None:
- return i, value
-
- return None, None
-
- def find_constant_input(self, node, expected_value, delta=0.000001):
- i, value = self.get_constant_input(node)
- if (
- value is not None
- and value.size == 1
- and abs(value - expected_value) < delta
- ):
- return i
-
- return -1
-
- def is_constant_with_specified_dimension(
- self, output_name, dimensions, description
- ):
- value = self.get_constant_value(output_name)
- if value is None:
- logger.debug(f"{description} {output_name} is not initializer.")
- return False
-
- if len(value.shape) != dimensions:
- logger.debug(
- f"{description} {output_name} shall have {dimensions} dimensions. Got shape {value.shape}"
- )
- return False
-
- return True
-
- def has_constant_input(self, node, expected_value, delta=0.000001):
- return self.find_constant_input(node, expected_value, delta) >= 0
-
- def get_children_subgraph_nodes(
- self, root_node, stop_nodes, input_name_to_nodes=None
- ):
- if input_name_to_nodes is None:
- input_name_to_nodes = self.input_name_to_nodes()
-
- children = input_name_to_nodes[root_node.output[0]]
-
- unique_nodes = []
-
- dq = deque(children)
- while len(dq) > 0:
- current_node = dq.pop()
- if current_node in stop_nodes:
- continue
-
- if current_node not in unique_nodes:
- unique_nodes.append(current_node)
-
- for output in current_node.output:
- if output in input_name_to_nodes:
- children = input_name_to_nodes[output]
- for child in children:
- dq.appendleft(child)
-
- return unique_nodes
-
- def tensor_shape_to_list(self, tensor_type):
- """Convert tensor shape to list"""
- shape_list = []
- for d in tensor_type.shape.dim:
- if d.HasField("dim_value"):
- shape_list.append(d.dim_value) # known dimension
- elif d.HasField("dim_param"):
- shape_list.append(d.dim_param) # unknown dimension with symbolic name
- else:
- shape_list.append("?") # shall not happen
- return shape_list
-
- def get_dtype(self, input_or_output: str):
- """Try get data type given a name (could be initializer, graph input or output)."""
- tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info}
-
- if input_or_output in tensor_type_map:
- return tensor_type_map[input_or_output].tensor_type.elem_type
-
- graph_input = self.find_graph_input(input_or_output)
- if graph_input:
- return graph_input.type.tensor_type.elem_type
-
- graph_output = self.find_graph_output(input_or_output)
- if graph_output:
- return graph_output.type.tensor_type.elem_type
-
- return None
-
- @staticmethod
- def get_node_attribute(node: NodeProto, attribute_name: str):
- for attr in node.attribute:
- if attr.name == attribute_name:
- value = helper.get_attribute_value(attr)
- return value
- return None
-
- def remove_cascaded_cast_nodes(self):
- """Remove Cast node that are followed by another Cast node like --> Cast --> Cast -->
- Note that this shall be used carefully since it might introduce semantic change.
- For example, float -> int -> float could get different value than the original float value.
- So, it is recommended to used only in post-processing of mixed precision conversion.
- """
- output_name_to_node = self.output_name_to_node()
- removed_count = 0
- for node in self.nodes():
- if node.op_type == "Cast":
- parent = self.get_parent(
- node, 0, output_name_to_node=output_name_to_node
- )
- if parent and parent.op_type == "Cast":
- node.input[0] = parent.input[0]
- removed_count += 1
-
- if removed_count > 0:
- logger.info("Removed %d cascaded Cast nodes", removed_count)
- self.prune_graph()
-
- def remove_useless_cast_nodes(self):
- """Remove cast nodes that are not needed: input and output has same data type."""
- shape_infer = self.infer_runtime_shape(update=True)
- if shape_infer is None:
- logger.info(
- f"Skip removing useless cast nodes since shape inference failed."
- )
- return
-
- def get_data_type(input_or_output_name):
- dtype = self.get_dtype(input_or_output_name)
- if dtype:
- return dtype
- if shape_infer.known_vi_[input_or_output_name].type.tensor_type.HasField(
- "elem_type"
- ):
- return shape_infer.known_vi_[
- input_or_output_name
- ].type.tensor_type.elem_type
- return None
-
- nodes_to_remove = []
- for node in self.nodes():
- if node.op_type == "Cast":
- input_dtype = get_data_type(node.input[0])
- output_dtype = get_data_type(node.output[0])
- if input_dtype and input_dtype == output_dtype:
- nodes_to_remove.append(node)
-
- if nodes_to_remove:
- graph_input_names = set(self.get_graphs_input_names())
- graph_output_names = set(self.get_graphs_output_names())
- for node in nodes_to_remove:
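-                # A Cast that drives a graph output cannot be bypassed on the consumer
-                # side; rewire its producer to emit the graph output name instead,
-                # unless the Cast also reads a graph input, in which case keep it.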
- if bool(set(node.output) & graph_output_names):
- if not bool(set(node.input) & graph_input_names):
- self.replace_output_of_all_nodes(node.input[0], node.output[0])
- else:
- continue
- else:
- self.replace_input_of_all_nodes(node.output[0], node.input[0])
- self.remove_node(node)
-
- logger.info(
- "Removed %d Cast nodes with output type same as input",
- len(nodes_to_remove),
- )
-
- def convert_model_float32_to_float16(self, cast_input_output=True):
- logger.warning(
- "The function convert_model_float32_to_float16 is deprecated. Use convert_float_to_float16 instead!"
- )
- self.convert_float_to_float16(
- use_symbolic_shape_infer=True, keep_io_types=cast_input_output
- )
-
- def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs):
- """Convert a model to half (default) or mixed precision.
-        To use mixed precision, users need to specify which graph inputs, outputs, operator types or nodes shall be kept in float32.
-        By default, symbolic shape inference is used to get shape and type information; otherwise, ONNX shape inference is used.
- Note that symbolic/ONNX shape inference might fail, and the conversion might not proceed without shape and type information.
-
- Args:
- use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference. Defaults to True.
- keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names.
-                                                              If True, model inputs/outputs are left as float32. Defaults to True in this method.
- op_block_list (List[str], optional): List of operator types to leave as float32.
- Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST` as default.
- node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None.
- force_fp16_initializers(bool): force converting all float initializers to float16.
-                                           Defaults to False, which converts only those needed to avoid precision loss.
- min_positive_val (float, optional): minimal positive value. Defaults to 1e-7.
- max_finite_val (float, optional): maximal finite value. Defaults to 1e4.
- """
- if "keep_io_types" not in kwargs:
- kwargs["keep_io_types"] = True
-
- model = self.model
- if use_symbolic_shape_infer:
- # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc) are not recognized by onnx shape inference.
- shape_infer_helper = SymbolicShapeInferenceHelper(model)
- model = shape_infer_helper.infer_shapes(
- model, auto_merge=True, guess_output_rank=False
- )
-
- parameters = {"disable_shape_infer": use_symbolic_shape_infer}
- parameters.update(
- {
- key: kwargs[key]
- for key in [
- "keep_io_types",
- "min_positive_val",
- "max_finite_val",
- "op_block_list",
- "node_block_list",
- "force_fp16_initializers",
- ]
- if key in kwargs
- }
- )
-
- fp16_model = convert_float_to_float16(model, **parameters)
- self.initialize(fp16_model)
-
- self.remove_cascaded_cast_nodes()
-
- self.remove_useless_cast_nodes()
-
- def create_node_name(self, op_type, name_prefix=None):
- """Create a unique node name that starts with a prefix (default is operator type).
-        The name will not duplicate any name that has been generated or already exists in the current graphs.
- Args:
- op_type (str): operator type
- name_prefix (str, optional): prefix of node name. Defaults to None.
-
- Returns:
- str: node name
- """
-
- if name_prefix:
- prefix = name_prefix if name_prefix.endswith("_") else (name_prefix + "_")
- else:
- prefix = op_type + "_"
-
- suffix: int = 0
- if prefix in self._node_name_suffix:
- suffix = self._node_name_suffix[prefix] + 1
- else:
-            # Check existing node names only once per prefix, as we assume create_node_name is called for every new node in fusion.
- for node in self.nodes():
- if node.name and node.name.startswith(prefix):
- try:
- index = int(node.name[len(prefix) :])
- suffix = max(index + 1, suffix)
- except ValueError:
- continue
-
- # Record the generated suffix so that we can avoid generating duplicated name.
- self._node_name_suffix[prefix] = suffix
-
- return prefix + str(suffix)
-
- def find_graph_input(self, input_name):
- for input in self.model.graph.input:
- if input.name == input_name:
- return input
- return None
-
- def find_graph_output(self, output_name):
- for output in self.model.graph.output:
- if output.name == output_name:
- return output
- return None
-
- def get_parent_subgraph_nodes(self, node, stop_nodes, output_name_to_node=None):
- if output_name_to_node is None:
- output_name_to_node = self.output_name_to_node()
-
- unique_nodes = []
-
- parents = self.get_parents(node, output_name_to_node)
- dq = deque(parents)
- while len(dq) > 0:
- current_node = dq.pop()
- if current_node in stop_nodes:
- continue
-
- if current_node not in unique_nodes:
- unique_nodes.append(current_node)
-
- for input in current_node.input:
- if input in output_name_to_node:
- dq.appendleft(output_name_to_node[input])
-
- return unique_nodes
-
- def get_graph_inputs(self, current_node, recursive=False):
- """
-        Find graph inputs that are linked to the current node.
- """
- graph_inputs = []
- for input in current_node.input:
- if self.find_graph_input(input) and input not in graph_inputs:
- graph_inputs.append(input)
-
- if recursive:
- parent_nodes = self.get_parent_subgraph_nodes(current_node, [])
- for node in parent_nodes:
- for input in node.input:
- if self.find_graph_input(input) and input not in graph_inputs:
- graph_inputs.append(input)
- return graph_inputs
-
- @staticmethod
- def input_index(node_output, child_node):
- index = 0
- for input in child_node.input:
- if input == node_output:
- return index
- index += 1
- return -1
-
- def remove_unused_constant(self):
- input_name_to_nodes = self.input_name_to_nodes()
-
- # remove unused constant
- unused_nodes = []
- nodes = self.nodes()
- for node in nodes:
- if node.op_type == "Constant" and node.output[0] not in input_name_to_nodes:
- unused_nodes.append(node)
-
- self.remove_nodes(unused_nodes)
-
- if len(unused_nodes) > 0:
- logger.debug(f"Removed unused constant nodes: {len(unused_nodes)}")
-
- def prune_graph(self, outputs=None):
- """
- Prune graph to keep only required outputs. It removes unnecessary inputs and nodes.
-        Nodes that are not linked (directly or indirectly) to any required output will be removed.
-
- Args:
- outputs (list): a list of graph outputs to retain. If it is None, all graph outputs will be kept.
- """
- if len(self.graphs()) > 1:
- logger.debug(f"Skip prune_graph since graph has subgraph")
- return
-
- if outputs is None:
- outputs = [output.name for output in self.model.graph.output]
-
- output_name_to_node = self.output_name_to_node()
- all_nodes = []
- for output in outputs:
- if output in output_name_to_node:
- last_node = output_name_to_node[output]
- if last_node in all_nodes:
- continue
- nodes = self.get_parent_subgraph_nodes(last_node, [])
- all_nodes.append(last_node)
- all_nodes.extend(nodes)
-
- nodes_to_remove = []
- for node in self.model.graph.node:
- if node not in all_nodes:
- nodes_to_remove.append(node)
-
- self.remove_nodes(nodes_to_remove)
-
- # remove outputs not in list
- output_to_remove = []
- for output in self.model.graph.output:
- if output.name not in outputs:
- output_to_remove.append(output)
- for output in output_to_remove:
- self.model.graph.output.remove(output)
-
- # remove inputs not used by any node.
- input_name_to_nodes = self.input_name_to_nodes()
- input_to_remove = []
- for input in self.model.graph.input:
- if input.name not in input_name_to_nodes:
- input_to_remove.append(input)
- for input in input_to_remove:
- self.model.graph.input.remove(input)
-
- if input_to_remove or output_to_remove or nodes_to_remove:
- logger.info(
- "Graph pruned: {} inputs, {} outputs and {} nodes are removed".format(
- len(input_to_remove), len(output_to_remove), len(nodes_to_remove)
- )
- )
-
- self.update_graph()
-
- def update_graph(self, verbose=False):
- graph = self.model.graph
-
- remaining_input_names = []
- for node in graph.node:
- if node.op_type in ["Loop", "Scan", "If"]:
- # TODO: handle inner graph
- logger.debug(
- f"Skip update_graph since graph has operator: {node.op_type}"
- )
- return
- if node.op_type != "Constant":
- for input_name in node.input:
- if input_name not in remaining_input_names:
- remaining_input_names.append(input_name)
- if verbose:
- logger.debug(f"remaining input names: {remaining_input_names}")
-
- # remove graph input that is not used
- inputs_to_remove = []
- for input in graph.input:
- if input.name not in remaining_input_names:
- inputs_to_remove.append(input)
- for input in inputs_to_remove:
- graph.input.remove(input)
-
- names_to_remove = [input.name for input in inputs_to_remove]
- logger.debug(f"remove {len(inputs_to_remove)} unused inputs: {names_to_remove}")
-
- # remove weights that are not used
- weights_to_remove = []
- weights_to_keep = []
- for initializer in graph.initializer:
- if (
- initializer.name not in remaining_input_names
- and not self.find_graph_output(initializer.name)
- ):
- weights_to_remove.append(initializer)
- else:
- weights_to_keep.append(initializer.name)
- for initializer in weights_to_remove:
- graph.initializer.remove(initializer)
-
- names_to_remove = [initializer.name for initializer in weights_to_remove]
- logger.debug(
- f"remove {len(weights_to_remove)} unused initializers: {names_to_remove}"
- )
- if verbose:
- logger.debug(f"remaining initializers:{weights_to_keep}")
-
- self.remove_unused_constant()
-
- def is_safe_to_fuse_nodes(
- self, nodes_to_remove, keep_outputs, input_name_to_nodes, output_name_to_node
- ):
- for node_to_remove in nodes_to_remove:
- for output_to_remove in node_to_remove.output:
- if output_to_remove in keep_outputs:
- continue
-
- if output_to_remove in input_name_to_nodes:
- for impacted_node in input_name_to_nodes[output_to_remove]:
- if impacted_node not in nodes_to_remove:
- logger.debug(
- f"it is not safe to remove nodes since output {output_to_remove} is used by {impacted_node}"
- )
- return False
- return True
-
- @staticmethod
- def graph_topological_sort(graph):
- deps_count = [0] * len(graph.node) # dependency count of each node
-        deps_to_nodes = {}  # map from input name to indices of nodes that consume it
-        sorted_nodes = []  # nodes appended in topological order
- for node_idx, node in enumerate(graph.node):
- # CANNOT use len(node.input) directly because input can be optional
- deps_count[node_idx] = sum(1 for _ in node.input if _)
- if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
- sorted_nodes.append(graph.node[node_idx])
- continue
-
- for input_name in node.input:
- if input_name not in deps_to_nodes:
- deps_to_nodes[input_name] = [node_idx]
- else:
- deps_to_nodes[input_name].append(node_idx)
-
-        # Note: this logic only applies to the top-level graph since a subgraph could use an initializer from the parent graph
- initializer_names = [init.name for init in graph.initializer]
- graph_input_names = [input.name for input in graph.input]
- input_names = initializer_names + graph_input_names
- input_names.sort()
- prev_input_name = None
- for input_name in input_names:
- if prev_input_name == input_name:
- continue
-
- prev_input_name = input_name
- if input_name in deps_to_nodes:
- for node_idx in deps_to_nodes[input_name]:
- deps_count[node_idx] = deps_count[node_idx] - 1
- if deps_count[node_idx] == 0:
- sorted_nodes.append(graph.node[node_idx])
-
- start = 0
- end = len(sorted_nodes)
-
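-        # Kahn's algorithm: walk the already-sorted nodes and release downstream nodes
-        # whose remaining dependency count drops to zero, preserving topological order.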
- while start < end:
- for output in sorted_nodes[start].output:
- if output in deps_to_nodes:
- for node_idx in deps_to_nodes[output]:
- deps_count[node_idx] = deps_count[node_idx] - 1
- if deps_count[node_idx] == 0:
- sorted_nodes.append(graph.node[node_idx])
- end = end + 1
- start = start + 1
-
- if end != len(graph.node):
- raise RuntimeError(
- f"Graph is not a DAG: end={end}, len(graph.node)={len(graph.node)}, graph.node[end]={graph.node[end]}"
- )
-
- graph.ClearField("node")
- graph.node.extend(sorted_nodes)
-
- def topological_sort(self):
- # TODO: support graph_topological_sort() in subgraphs
- # for graph in self.graphs():
- # self.graph_topological_sort(graph)
- OnnxModel.graph_topological_sort(self.model.graph)
-
- @staticmethod
- def save(
- model,
- output_path,
- save_as_external_data=False,
- all_tensors_to_one_file=True,
- size_threshold=1024,
- convert_attribute=False,
- ):
- Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-
- if save_as_external_data:
- # Save model to external data, which is needed for model size > 2GB
- output_dir = Path(output_path).parent
- output_dir.mkdir(parents=True, exist_ok=True)
- external_data_path = output_path + ".data"
- location = (
- Path(external_data_path).name if all_tensors_to_one_file else None
- )
-
- if os.path.exists(output_path):
- logger.info(f"Delete the existed onnx file: {output_path}")
- os.remove(output_path)
-
- if all_tensors_to_one_file:
- if os.path.exists(external_data_path):
-                    # Delete the external data file. Otherwise, data will be appended to the existing file.
-                    logger.info(
-                        f"Delete the existing external data file: {external_data_path}"
-                    )
- os.remove(external_data_path)
- else:
- if os.listdir(output_dir):
- raise RuntimeError(
- f"Output directory ({output_dir}) for external data is not empty."
- )
-
- save_model(
- model,
- output_path,
- save_as_external_data=True,
- all_tensors_to_one_file=all_tensors_to_one_file,
- location=location,
- size_threshold=size_threshold,
- convert_attribute=convert_attribute,
- )
- else:
- save_model(model, output_path)
-
- def save_model_to_file(
- self, output_path, use_external_data_format=False, all_tensors_to_one_file=True
- ):
- logger.info(f"Sort graphs in topological order")
- self.topological_sort()
-
- if output_path.endswith(".json"): # Output text for testing small model.
- with open(output_path, "w") as out:
-                out.write(str(self.model))
- else:
- OnnxModel.save(
- self.model,
- output_path,
- use_external_data_format,
- all_tensors_to_one_file,
- )
- logger.info(f"Model saved to {output_path}")
-
- def get_graph_inputs_excluding_initializers(self):
- """
-        Returns real graph inputs (excluding initializers from older ONNX models).
- """
- graph_inputs = []
- for input in self.model.graph.input:
- if self.get_initializer(input.name) is None:
- graph_inputs.append(input)
- return graph_inputs
-
- def get_opset_version(self):
- """Get opset version of onnx domain
-
- Raises:
- RuntimeError: ONNX model has no opset for default domain.
-
- Returns:
- int: opset version of onnx domain.
- """
- for opset in self.model.opset_import:
- if opset.domain in ["", "ai.onnx"]:
- return opset.version
- raise RuntimeError("ONNX model has no opset for default domain")
-
- @staticmethod
- def has_same_value(tensor1: TensorProto, tensor2: TensorProto) -> bool:
- """Returns True when two tensors have same value.
- Note that name can be different.
-
- Args:
- tensor1 (TensorProto): initializer 1
- tensor2 (TensorProto): initializer 2
-
- Returns:
-            bool: True when the two initializers have the same value.
- """
- if tensor1.data_type != tensor2.data_type or tensor1.dims != tensor2.dims:
- return False
- if tensor1.HasField("raw_data") and tensor2.HasField("raw_data"):
- return tensor1.raw_data == tensor2.raw_data
-        return bool((numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2)).all())
-
- def remove_duplicated_initializer(self):
- """Remove initializers with duplicated values, and only keep the first one.
-        It could help reduce the size of models (like ALBERT) with shared weights.
-        Note: this function does not process subgraphs.
- """
- if len(self.graphs()) > 1:
- logger.warning("remove_duplicated_initializer does not process subgraphs.")
-
- initializer_count = len(self.model.graph.initializer)
-
- same = [-1] * initializer_count
- for i in range(initializer_count - 1):
- if same[i] >= 0:
- continue
- for j in range(i + 1, initializer_count):
- if OnnxModel.has_same_value(
- self.model.graph.initializer[i], self.model.graph.initializer[j]
- ):
- same[j] = i
-
- count = 0
- for i in range(initializer_count):
- if same[i] >= 0:
- count += 1
- self.replace_input_of_all_nodes(
- self.model.graph.initializer[i].name,
- self.model.graph.initializer[same[i]].name,
- )
-
- if count > 0:
- self.update_graph()
- print(f"Removed {count} initializers with duplicated value")
-
- def add_prefix_to_names(self, prefix: str):
- """Add prefix to initializer or intermediate outputs in graph. Main graph inputs and outputs are excluded.
- It could help avoid conflicting in name of node_args when merging two graphs.
- Note: this function does not process subgraph.
- """
- if len(self.graphs()) > 1:
- logger.warning("add_prefix_to_names does not process subgraphs.")
-
- # Exclude the names of inputs and outputs of main graph (but not subgraphs)
- excluded = [i.name for i in self.model.graph.input] + [
- o.name for o in self.model.graph.output
- ]
-
- for initializer in self.model.graph.initializer:
- if initializer.name not in excluded:
- if prefix + initializer.name not in excluded:
- initializer.name = prefix + initializer.name
-
- for node in self.model.graph.node:
- # update name of node inputs
- for j in range(len(node.input)):
- if node.input[j] not in excluded:
- if prefix + node.input[j] not in excluded:
- node.input[j] = prefix + node.input[j]
-
- # update name of node outputs
- for j in range(len(node.output)):
- if node.output[j] not in excluded:
- if prefix + node.output[j] not in excluded:
- node.output[j] = prefix + node.output[j]
-
- for value_info in self.model.graph.value_info:
- if value_info.name not in excluded:
- value_info.name = prefix + value_info.name
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py
deleted file mode 100644
index a48b53db83fa675713cd9e4ac3b38d2ed554a73b..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-import logging
-import os
-import sys
-from typing import Dict
-
-# In ORT Package the symbolic_shape_infer.py is in ../tools
-file_path = os.path.dirname(__file__)
-if os.path.exists(os.path.join(file_path, "../tools/symbolic_shape_infer.py")):
- sys.path.append(os.path.join(file_path, "../tools"))
-else:
- sys.path.append(os.path.join(file_path, ".."))
-
-from .symbolic_shape_infer import (
- SymbolicShapeInference,
- get_shape_from_type_proto,
- sympy,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class SymbolicShapeInferenceHelper(SymbolicShapeInference):
- def __init__(
- self,
- model,
- verbose=0,
- int_max=2**31 - 1,
- auto_merge=True,
- guess_output_rank=False,
- ):
- super().__init__(int_max, auto_merge, guess_output_rank, verbose)
- self.model_ = model
- self.all_shapes_inferred_: bool = False
- self.is_inferred_: bool = False
- self.dynamic_axis_mapping_: Dict[str, int] = {}
-
- def infer(self, dynamic_axis_mapping: Dict[str, int], max_runs: int = 128):
- """Run shape inference, and try replace dynamic axis from string to integer when mapping is provided.
-
- Args:
- dynamic_axis_mapping (_type_): a dictionary with name of dynamic axis as key, like {"batch_size" : 4}
- max_runs (int, optional): limit maximum number of runs to avoid infinite loop. Defaults to 32.
-
- Returns:
- bool: whether all shapes has been inferred or not.
- """
- assert dynamic_axis_mapping is not None
-
- if self.is_inferred_ and self.dynamic_axis_mapping_ == dynamic_axis_mapping:
- return self.all_shapes_inferred_
-
- self.dynamic_axis_mapping_ = dynamic_axis_mapping
-
- self._preprocess(self.model_)
-
- count = 0
- while self.run_:
- logger.debug(f"shape infer run {count}")
- self.all_shapes_inferred_ = self._infer_impl()
- count += 1
- if max_runs > 0 and count >= max_runs:
- break
-
- self.is_inferred_ = True
- return self.all_shapes_inferred_
-
- def _get_sympy_shape(self, node, idx):
- """Override it to ensure shape inference by giving the actual value of dynamic axis."""
- sympy_shape = []
-
- shape = self._get_shape(node, idx)
- if shape:
- for dim in shape:
- if isinstance(dim, str):
- if dim in self.dynamic_axis_mapping_:
- sympy_shape.append(self.dynamic_axis_mapping_[dim])
- elif dim in self.symbolic_dims_:
- sympy_shape.append(self.symbolic_dims_[dim])
- else:
- sympy_shape.append(sympy.Symbol(dim, integer=True))
- else:
- assert dim is not None
- sympy_shape.append(dim)
- return sympy_shape
-
- def get_edge_shape(self, edge):
- """Get shape of an edge.
-
- Args:
- edge (str): name of edge
-
- Returns:
- Optional[List[int]]: the shape, or None if shape is unknown
- """
- assert self.all_shapes_inferred_
- if edge not in self.known_vi_:
- print("Cannot retrieve the shape of " + str(edge))
- return None
-
- type_proto = self.known_vi_[edge].type
- shape = get_shape_from_type_proto(type_proto)
-
- if shape is not None:
- for i, dim in enumerate(shape):
- if isinstance(dim, str) and dim in self.dynamic_axis_mapping_:
- shape[i] = self.dynamic_axis_mapping_[dim]
-
- return shape
-
- def compare_shape(self, edge, edge_other):
- """Compare shape of two edges.
-
- Args:
- edge (str): name of edge
- edge_other (str): name of another edge
-
-        Raises:
-            Exception: At least one shape is missing for the edges to compare
-
-        Returns:
-            bool: whether the shapes are the same or not
- """
- assert self.all_shapes_inferred_
- shape = self.get_edge_shape(edge)
- shape_other = self.get_edge_shape(edge_other)
- if shape is None or shape_other is None:
- raise Exception("At least one shape is missed for edges to compare")
- return shape == shape_other
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py
deleted file mode 100644
index 2311ad57fdefa502a9e6d7edf44dc884c843ee51..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py
+++ /dev/null
@@ -1,2805 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-# -*- coding: UTF-8 -*-
-import argparse
-import logging
-
-import numpy as np
-import onnx
-import sympy
-from onnx import helper, numpy_helper, shape_inference
-from packaging import version
-
-assert version.parse(onnx.__version__) >= version.parse("1.8.0")
-
-logger = logging.getLogger(__name__)
-
-
-def get_attribute(node, attr_name, default_value=None):
- found = [attr for attr in node.attribute if attr.name == attr_name]
- if found:
- return helper.get_attribute_value(found[0])
- return default_value
-
-
-def get_dim_from_proto(dim):
- return (
- getattr(dim, dim.WhichOneof("value"))
- if type(dim.WhichOneof("value")) == str
- else None
- )
-
-
-def is_sequence(type_proto):
- cls_type = type_proto.WhichOneof("value")
- assert cls_type in ["tensor_type", "sequence_type"]
- return cls_type == "sequence_type"
-
-
-def get_shape_from_type_proto(type_proto):
- assert not is_sequence(type_proto)
- if type_proto.tensor_type.HasField("shape"):
- return [get_dim_from_proto(d) for d in type_proto.tensor_type.shape.dim]
- else:
-        return None  # note: no shape is different from a shape without dims (a scalar)
-
-
-def get_shape_from_value_info(vi):
- cls_type = vi.type.WhichOneof("value")
- if cls_type is None:
- return None
- if is_sequence(vi.type):
- if "tensor_type" == vi.type.sequence_type.elem_type.WhichOneof("value"):
- return get_shape_from_type_proto(vi.type.sequence_type.elem_type)
- else:
- return None
- else:
- return get_shape_from_type_proto(vi.type)
-
-
-def make_named_value_info(name):
- vi = onnx.ValueInfoProto()
- vi.name = name
- return vi
-
-
-def get_shape_from_sympy_shape(sympy_shape):
- return [
- None if i is None else (int(i) if is_literal(i) else str(i))
- for i in sympy_shape
- ]
-
-
-def is_literal(dim):
- return type(dim) in [int, np.int64, np.int32, sympy.Integer] or (
- hasattr(dim, "is_number") and dim.is_number
- )
-
-
-def handle_negative_axis(axis, rank):
- assert axis < rank and axis >= -rank
- return axis if axis >= 0 else rank + axis
-
-
-def get_opset(mp, domain=None):
- domain = domain or ["", "onnx", "ai.onnx"]
- if type(domain) != list:
- domain = [domain]
- for opset in mp.opset_import:
- if opset.domain in domain:
- return opset.version
-
- return None
-
-
-def as_scalar(x):
- if type(x) == list:
- assert len(x) == 1
- return x[0]
- elif type(x) == np.ndarray:
- return x.item()
- else:
- return x
-
-
-def as_list(x, keep_none):
- if type(x) == list:
- return x
- elif type(x) == np.ndarray:
- return list(x)
- elif keep_none and x is None:
- return None
- else:
- return [x]
-
-
-def sympy_reduce_product(x):
- if type(x) == list:
- value = sympy.Integer(1)
- for v in x:
- value = value * v
- else:
- value = x
- return value
-
-
-class SymbolicShapeInference:
- def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
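-        # Map op_type to the handler that computes its (possibly symbolic) output shape.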
- self.dispatcher_ = {
- "Add": self._infer_symbolic_compute_ops,
- "ArrayFeatureExtractor": self._infer_ArrayFeatureExtractor,
- "AveragePool": self._infer_Pool,
- "BatchNormalization": self._infer_BatchNormalization,
- "Cast": self._infer_Cast,
- "CategoryMapper": self._infer_CategoryMapper,
- "Compress": self._infer_Compress,
- "Concat": self._infer_Concat,
- "ConcatFromSequence": self._infer_ConcatFromSequence,
- "Constant": self._infer_Constant,
- "ConstantOfShape": self._infer_ConstantOfShape,
- "Conv": self._infer_Conv,
- "CumSum": self._pass_on_shape_and_type,
- "Div": self._infer_symbolic_compute_ops,
- "Einsum": self._infer_Einsum,
- "Expand": self._infer_Expand,
- "Equal": self._infer_symbolic_compute_ops,
- "Floor": self._infer_symbolic_compute_ops,
- "Gather": self._infer_Gather,
- "GatherElements": self._infer_GatherElements,
- "GatherND": self._infer_GatherND,
- "Identity": self._pass_on_shape_and_type,
- "If": self._infer_If,
- "Loop": self._infer_Loop,
- "MatMul": self._infer_MatMul,
- "MatMulInteger16": self._infer_MatMulInteger,
- "MaxPool": self._infer_Pool,
- "Max": self._infer_symbolic_compute_ops,
- "Min": self._infer_symbolic_compute_ops,
- "Mul": self._infer_symbolic_compute_ops,
- "NonMaxSuppression": self._infer_NonMaxSuppression,
- "NonZero": self._infer_NonZero,
- "OneHot": self._infer_OneHot,
- "Pad": self._infer_Pad,
- "Range": self._infer_Range,
- "Reciprocal": self._pass_on_shape_and_type,
- "ReduceSum": self._infer_ReduceSum,
- "ReduceProd": self._infer_ReduceProd,
- "Reshape": self._infer_Reshape,
- "Resize": self._infer_Resize,
- "Round": self._pass_on_shape_and_type,
- "Scan": self._infer_Scan,
- "ScatterElements": self._infer_ScatterElements,
- "SequenceAt": self._infer_SequenceAt,
- "SequenceInsert": self._infer_SequenceInsert,
- "Shape": self._infer_Shape,
- "Size": self._infer_Size,
- "Slice": self._infer_Slice,
- "SoftmaxCrossEntropyLoss": self._infer_SoftmaxCrossEntropyLoss,
- "SoftmaxCrossEntropyLossInternal": self._infer_SoftmaxCrossEntropyLoss,
- "NegativeLogLikelihoodLossInternal": self._infer_SoftmaxCrossEntropyLoss,
- "Split": self._infer_Split,
- "SplitToSequence": self._infer_SplitToSequence,
- "Squeeze": self._infer_Squeeze,
- "Sub": self._infer_symbolic_compute_ops,
- "Tile": self._infer_Tile,
- "TopK": self._infer_TopK,
- "Transpose": self._infer_Transpose,
- "Unsqueeze": self._infer_Unsqueeze,
- "Where": self._infer_symbolic_compute_ops,
- "ZipMap": self._infer_ZipMap,
- "Neg": self._infer_symbolic_compute_ops,
- # contrib ops:
- "Attention": self._infer_Attention,
- "BiasGelu": self._infer_BiasGelu,
- "EmbedLayerNormalization": self._infer_EmbedLayerNormalization,
- "FastGelu": self._infer_FastGelu,
- "Gelu": self._infer_Gelu,
- "LayerNormalization": self._infer_LayerNormalization,
- "LongformerAttention": self._infer_LongformerAttention,
- "PythonOp": self._infer_PythonOp,
- "SkipLayerNormalization": self._infer_SkipLayerNormalization,
- }
- self.aten_op_dispatcher_ = {
- "embedding": self._infer_Gather,
- "bitwise_or": self._infer_aten_bitwise_or,
- "diagonal": self._infer_aten_diagonal,
- "max_pool2d_with_indices": self._infer_aten_pool2d,
- "max": self._infer_aten_minmax,
- "min": self._infer_aten_minmax,
- "multinomial": self._infer_aten_multinomial,
- "unfold": self._infer_aten_unfold,
- "argmax": self._infer_aten_argmax,
- "avg_pool2d": self._infer_aten_pool2d,
- "_adaptive_avg_pool2d": self._infer_aten_pool2d,
- "numpy_T": self._infer_Transpose,
- }
- self.run_ = True
- self.suggested_merge_ = {}
- self.symbolic_dims_ = {}
- self.input_symbols_ = {}
- self.auto_merge_ = auto_merge
- self.guess_output_rank_ = guess_output_rank
- self.verbose_ = verbose
- self.int_max_ = int_max
- self.subgraph_id_ = 0
- self.prefix_ = prefix
-
- def _add_suggested_merge(self, symbols, apply=False):
- assert all(
- [
- (type(s) == str and s in self.symbolic_dims_) or is_literal(s)
- for s in symbols
- ]
- )
- symbols = set(symbols)
- for k, v in self.suggested_merge_.items():
- if k in symbols:
- symbols.remove(k)
- symbols.add(v)
- map_to = None
- # if there is literal, map to it first
- for s in symbols:
- if is_literal(s):
- map_to = s
- break
- # when no literals, map to input symbolic dims, then existing symbolic dims
- if map_to is None:
- for s in symbols:
- if s in self.input_symbols_:
- map_to = s
- break
- if map_to is None:
- for s in symbols:
- if type(self.symbolic_dims_[s]) == sympy.Symbol:
- map_to = s
- break
- # when nothing to map to, use the shorter one
- if map_to is None:
- if self.verbose_ > 0:
- logger.warning(
- "Potential unsafe merge between symbolic expressions: ({})".format(
- ",".join(symbols)
- )
- )
- symbols_list = list(symbols)
- lens = [len(s) for s in symbols_list]
- map_to = symbols_list[lens.index(min(lens))]
- symbols.remove(map_to)
-
- for s in symbols:
- if s == map_to:
- continue
- if is_literal(map_to) and is_literal(s):
- assert int(map_to) == int(s)
- self.suggested_merge_[s] = int(map_to) if is_literal(map_to) else map_to
- for k, v in self.suggested_merge_.items():
- if v == s:
- self.suggested_merge_[k] = map_to
- if apply and self.auto_merge_:
- self._apply_suggested_merge()
-
- def _apply_suggested_merge(self, graph_input_only=False):
- if not self.suggested_merge_:
- return
- for i in list(self.out_mp_.graph.input) + (
- [] if graph_input_only else list(self.out_mp_.graph.value_info)
- ):
- for d in i.type.tensor_type.shape.dim:
- if d.dim_param in self.suggested_merge_:
- v = self.suggested_merge_[d.dim_param]
- if is_literal(v):
- d.dim_value = int(v)
- else:
- d.dim_param = v
-
- def _preprocess(self, in_mp):
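-        # Copy the input model and build name lookups for graph inputs, initializers,
-        # and the value_info entries known so far.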
- self.out_mp_ = onnx.ModelProto()
- self.out_mp_.CopyFrom(in_mp)
- self.graph_inputs_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)])
- self.initializers_ = dict([(i.name, i) for i in self.out_mp_.graph.initializer])
- self.known_vi_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)])
- self.known_vi_.update(
- dict(
- [
- (
- i.name,
- helper.make_tensor_value_info(
- i.name, i.data_type, list(i.dims)
- ),
- )
- for i in self.out_mp_.graph.initializer
- ]
- )
- )
-
- def _merge_symbols(self, dims):
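-        # Reduce a list of candidate dims (symbolic strings and/or literals) to one dim;
-        # mixed string/literal lists are only resolved when auto_merge_ is enabled,
-        # otherwise None signals that the dims cannot be merged.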
- if not all([type(d) == str for d in dims]):
- if self.auto_merge_:
- unique_dims = list(set(dims))
- is_int = [is_literal(d) for d in unique_dims]
- assert (
- sum(is_int) <= 1
-                )  # if there is more than one unique int, something is wrong
- if sum(is_int) == 1:
- int_dim = is_int.index(1)
- if self.verbose_ > 0:
- logger.debug(
- "dim {} has been merged with value {}".format(
- unique_dims[:int_dim] + unique_dims[int_dim + 1 :],
- unique_dims[int_dim],
- )
- )
- self._check_merged_dims(unique_dims, allow_broadcast=False)
- return unique_dims[int_dim]
- else:
- if self.verbose_ > 0:
- logger.debug(
- "dim {} has been mergd with dim {}".format(
- unique_dims[1:], unique_dims[0]
- )
- )
- return dims[0]
- else:
- return None
- if all([d == dims[0] for d in dims]):
- return dims[0]
- merged = [
- self.suggested_merge_[d] if d in self.suggested_merge_ else d for d in dims
- ]
- if all([d == merged[0] for d in merged]):
- assert merged[0] in self.symbolic_dims_
- return merged[0]
- else:
- return None
-
- # broadcast from right to left, and merge symbolic dims if needed
- def _broadcast_shapes(self, shape1, shape2):
- new_shape = []
- rank1 = len(shape1)
- rank2 = len(shape2)
- new_rank = max(rank1, rank2)
- for i in range(new_rank):
- dim1 = shape1[rank1 - 1 - i] if i < rank1 else 1
- dim2 = shape2[rank2 - 1 - i] if i < rank2 else 1
- if dim1 == 1 or dim1 == dim2:
- new_dim = dim2
- elif dim2 == 1:
- new_dim = dim1
- else:
- new_dim = self._merge_symbols([dim1, dim2])
- if not new_dim:
-                    # warn about unsupported broadcast when auto merge is disabled
-                    # note that auto merge has the risk of incorrectly merging symbols while one of them is 1
- # for example, 'a' = 1, 'b' = 5 at runtime is valid broadcasting, but with auto merge 'a' == 'b'
- if self.auto_merge_:
- self._add_suggested_merge([dim1, dim2], apply=True)
- else:
- logger.warning(
- "unsupported broadcast between "
- + str(dim1)
- + " "
- + str(dim2)
- )
- new_shape = [new_dim] + new_shape
- return new_shape
-
- def _get_shape(self, node, idx):
- name = node.input[idx]
- if name in self.known_vi_:
- vi = self.known_vi_[name]
- return get_shape_from_value_info(vi)
- else:
- assert name in self.initializers_
- return list(self.initializers_[name].dims)
-
- def _get_shape_rank(self, node, idx):
- return len(self._get_shape(node, idx))
-
- def _get_sympy_shape(self, node, idx):
- sympy_shape = []
- for d in self._get_shape(node, idx):
- if type(d) == str:
- sympy_shape.append(
- self.symbolic_dims_[d]
- if d in self.symbolic_dims_
- else sympy.Symbol(d, integer=True, nonnegative=True)
- )
- else:
-                assert d is not None
- sympy_shape.append(d)
- return sympy_shape
-
- def _get_value(self, node, idx):
- name = node.input[idx]
- assert name in self.sympy_data_ or name in self.initializers_
- return (
- self.sympy_data_[name]
- if name in self.sympy_data_
- else numpy_helper.to_array(self.initializers_[name])
- )
-
- def _try_get_value(self, node, idx):
- if idx >= len(node.input):
- return None
- name = node.input[idx]
- if name in self.sympy_data_ or name in self.initializers_:
- return self._get_value(node, idx)
- return None
-
- def _update_computed_dims(self, new_sympy_shape):
- for i, new_dim in enumerate(new_sympy_shape):
- if not is_literal(new_dim) and not type(new_dim) == str:
- str_dim = str(new_dim)
- if str_dim in self.suggested_merge_:
- if is_literal(self.suggested_merge_[str_dim]):
- continue # no need to create dim for literals
- new_sympy_shape[i] = self.symbolic_dims_[
- self.suggested_merge_[str_dim]
- ]
- else:
- # add new_dim if it's a computational expression
- if not str(new_dim) in self.symbolic_dims_:
- self.symbolic_dims_[str(new_dim)] = new_dim
-
- def _onnx_infer_single_node(self, node):
- # skip onnx shape inference for some ops, as they are handled in _infer_*
- skip_infer = node.op_type in [
- "If",
- "Loop",
- "Scan",
- "SplitToSequence",
- "ZipMap", # contrib ops
- "Attention",
- "BiasGelu",
- "EmbedLayerNormalization",
- "FastGelu",
- "Gelu",
- "LayerNormalization",
- "LongformerAttention",
- "SkipLayerNormalization",
- "PythonOp",
- ]
-
- if not skip_infer:
- # Only pass initializers that satisfy the following conditions:
- # (1) The operator needs the value of some input for shape inference.
- # For example, Unsqueeze in opset 13 uses the axes input to calculate the shape of its output.
- # (2) opset version >= 9. In older versions, the onnx spec requires initializers to appear in graph input.
- # (3) The initializer is not in graph input. This means the node input is "constant" in inference.
- initializers = []
- if (get_opset(self.out_mp_) >= 9) and node.op_type in ["Unsqueeze"]:
- initializers = [
- self.initializers_[name]
- for name in node.input
- if (name in self.initializers_ and name not in self.graph_inputs_)
- ]
-
- # run single node inference with self.known_vi_ shapes
- tmp_graph = helper.make_graph(
- [node],
- "tmp",
- [self.known_vi_[i] for i in node.input if i],
- [make_named_value_info(i) for i in node.output],
- initializers,
- )
-
- self.tmp_mp_.graph.CopyFrom(tmp_graph)
-
- self.tmp_mp_ = shape_inference.infer_shapes(self.tmp_mp_)
-
- for i_o in range(len(node.output)):
- o = node.output[i_o]
- vi = self.out_mp_.graph.value_info.add()
- if not skip_infer:
- vi.CopyFrom(self.tmp_mp_.graph.output[i_o])
- else:
- vi.name = o
- self.known_vi_[o] = vi
-
- def _onnx_infer_subgraph(
- self, node, subgraph, use_node_input=True, inc_subgraph_id=True
- ):
- if self.verbose_ > 2:
- logger.debug(
- "Inferencing subgraph of node {} with output({}...): {}".format(
- node.name, node.output[0], node.op_type
- )
- )
- # node inputs are not passed directly to the subgraph
- # it's up to the node dispatcher to prepare subgraph input
- # for example, with Scan/Loop, subgraph input shape would be trimmed from node input shape
- # besides, inputs in subgraph could shadow implicit inputs
- subgraph_inputs = set(
- [i.name for i in list(subgraph.initializer) + list(subgraph.input)]
- )
- subgraph_implicit_input = set(
- [name for name in self.known_vi_.keys() if not name in subgraph_inputs]
- )
- tmp_graph = helper.make_graph(
- list(subgraph.node),
- "tmp",
- list(subgraph.input) + [self.known_vi_[i] for i in subgraph_implicit_input],
- [make_named_value_info(i.name) for i in subgraph.output],
- )
- tmp_graph.initializer.extend(
- [
- i
- for i in self.out_mp_.graph.initializer
- if i.name in subgraph_implicit_input
- ]
- )
- tmp_graph.initializer.extend(subgraph.initializer)
- self.tmp_mp_.graph.CopyFrom(tmp_graph)
-
- symbolic_shape_inference = SymbolicShapeInference(
- self.int_max_,
- self.auto_merge_,
- self.guess_output_rank_,
- self.verbose_,
- prefix=self.prefix_ + "_" + str(self.subgraph_id_),
- )
- if inc_subgraph_id:
- self.subgraph_id_ += 1
-
- all_shapes_inferred = False
- symbolic_shape_inference._preprocess(self.tmp_mp_)
- symbolic_shape_inference.suggested_merge_ = self.suggested_merge_.copy()
- while symbolic_shape_inference.run_:
- all_shapes_inferred = symbolic_shape_inference._infer_impl(
- self.sympy_data_.copy()
- )
- symbolic_shape_inference._update_output_from_vi()
- if use_node_input:
- # if the subgraph uses node input, it needs to be updated to the merged dims
- subgraph.ClearField("input")
- subgraph.input.extend(
- symbolic_shape_inference.out_mp_.graph.input[: len(node.input)]
- )
- subgraph.ClearField("output")
- subgraph.output.extend(symbolic_shape_inference.out_mp_.graph.output)
- subgraph.ClearField("value_info")
- subgraph.value_info.extend(symbolic_shape_inference.out_mp_.graph.value_info)
- subgraph.ClearField("node")
- subgraph.node.extend(symbolic_shape_inference.out_mp_.graph.node)
- # for new symbolic dims from subgraph output, add to main graph symbolic dims
- subgraph_shapes = [
- get_shape_from_value_info(o)
- for o in symbolic_shape_inference.out_mp_.graph.output
- ]
- subgraph_new_symbolic_dims = set(
- [
- d
- for s in subgraph_shapes
- if s
- for d in s
- if type(d) == str and not d in self.symbolic_dims_
- ]
- )
- new_dims = {}
- for d in subgraph_new_symbolic_dims:
- assert d in symbolic_shape_inference.symbolic_dims_
- new_dims[d] = symbolic_shape_inference.symbolic_dims_[d]
- self.symbolic_dims_.update(new_dims)
- return symbolic_shape_inference
-
- def _get_int_values(self, node, broadcast=False):
- values = [self._try_get_value(node, i) for i in range(len(node.input))]
- if all([v is not None for v in values]):
- # some shape compute is in floating point, cast to int for sympy
- for i, v in enumerate(values):
- if type(v) != np.ndarray:
- continue
- if len(v.shape) > 1:
- new_v = None # ignore value for rank > 1
- elif len(v.shape) == 0:
- new_v = int(v.item())
- else:
- assert len(v.shape) == 1
- new_v = [int(vv) for vv in v]
- values[i] = new_v
- values_len = [len(v) if type(v) == list else 0 for v in values]
- max_len = max(values_len)
- if max_len >= 1 and broadcast:
- # broadcast
- for i, v in enumerate(values):
- if v is None:
- continue # don't broadcast if value is unknown
- if type(v) == list:
- if len(v) < max_len:
- values[i] = v * max_len
- else:
- assert len(v) == max_len
- else:
- values[i] = [v] * max_len
- return values
-
- def _compute_on_sympy_data(self, node, op_func):
- assert len(node.output) == 1
- values = self._get_int_values(node, broadcast=True)
- if all([v is not None for v in values]):
- is_list = [type(v) == list for v in values]
- as_list = any(is_list)
- if as_list:
- self.sympy_data_[node.output[0]] = [op_func(vs) for vs in zip(*values)]
- else:
- self.sympy_data_[node.output[0]] = op_func(values)
-
- def _pass_on_sympy_data(self, node):
- assert len(node.input) == 1 or node.op_type in [
- "Reshape",
- "Unsqueeze",
- "Squeeze",
- ]
- self._compute_on_sympy_data(node, lambda x: x[0])
-
- def _pass_on_shape_and_type(self, node):
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- self._get_shape(node, 0),
- )
- )
-
- def _new_symbolic_dim(self, prefix, dim):
- new_dim = "{}_d{}".format(prefix, dim)
- if new_dim in self.suggested_merge_:
- v = self.suggested_merge_[new_dim]
- new_symbolic_dim = sympy.Integer(int(v)) if is_literal(v) else v
- else:
- new_symbolic_dim = sympy.Symbol(new_dim, integer=True, nonnegative=True)
- self.symbolic_dims_[new_dim] = new_symbolic_dim
- return new_symbolic_dim
-
- def _new_symbolic_dim_from_output(self, node, out_idx=0, dim=0):
- return self._new_symbolic_dim(
- "{}{}_{}_o{}_".format(
- node.op_type,
- self.prefix_,
- list(self.out_mp_.graph.node).index(node),
- out_idx,
- ),
- dim,
- )
-
- def _new_symbolic_shape(self, rank, node, out_idx=0):
- return [
- self._new_symbolic_dim_from_output(node, out_idx, i) for i in range(rank)
- ]
-
- def _compute_conv_pool_shape(self, node):
- sympy_shape = self._get_sympy_shape(node, 0)
- if len(node.input) > 1:
- W_shape = self._get_sympy_shape(node, 1)
- rank = len(W_shape) - 2 # number of spatial axes
- kernel_shape = W_shape[-rank:]
- sympy_shape[1] = W_shape[0]
- else:
- W_shape = None
- kernel_shape = get_attribute(node, "kernel_shape")
- rank = len(kernel_shape)
-
- assert len(sympy_shape) == rank + 2
-
- # symbolic shape inference is only needed if the input has symbolic dims in its spatial axes
- is_symbolic_dims = [not is_literal(i) for i in sympy_shape[-rank:]]
-
- if not any(is_symbolic_dims):
- shape = get_shape_from_value_info(self.known_vi_[node.output[0]])
- if len(shape) > 0:
- assert len(sympy_shape) == len(shape)
- sympy_shape[-rank:] = [sympy.Integer(d) for d in shape[-rank:]]
- return sympy_shape
-
- dilations = get_attribute(node, "dilations", [1] * rank)
- strides = get_attribute(node, "strides", [1] * rank)
- effective_kernel_shape = [
- (k - 1) * d + 1 for k, d in zip(kernel_shape, dilations)
- ]
- pads = get_attribute(node, "pads")
- if pads is None:
- pads = [0] * (2 * rank)
- auto_pad = get_attribute(node, "auto_pad", b"NOTSET").decode("utf-8")
- if auto_pad != "VALID" and auto_pad != "NOTSET":
- try:
- residual = [
- sympy.Mod(d, s) for d, s in zip(sympy_shape[-rank:], strides)
- ]
- total_pads = [
- max(0, (k - s) if r == 0 else (k - r))
- for k, s, r in zip(effective_kernel_shape, strides, residual)
- ]
- except TypeError: # sympy may throw TypeError: cannot determine truth value of Relational
- total_pads = [
- max(0, (k - s)) for k, s in zip(effective_kernel_shape, strides)
- ] # assuming no residual if sympy throws error
- elif auto_pad == "VALID":
- total_pads = []
- else:
- total_pads = [0] * rank
- else:
- assert len(pads) == 2 * rank
- total_pads = [p1 + p2 for p1, p2 in zip(pads[:rank], pads[rank:])]
-
- ceil_mode = get_attribute(node, "ceil_mode", 0)
- for i in range(rank):
- effective_input_size = sympy_shape[-rank + i]
- if len(total_pads) > 0:
- effective_input_size = effective_input_size + total_pads[i]
- if ceil_mode:
- strided_kernel_positions = sympy.ceiling(
- (effective_input_size - effective_kernel_shape[i]) / strides[i]
- )
- else:
- strided_kernel_positions = (
- effective_input_size - effective_kernel_shape[i]
- ) // strides[i]
- sympy_shape[-rank + i] = strided_kernel_positions + 1
- return sympy_shape
-
- def _check_merged_dims(self, dims, allow_broadcast=True):
- if allow_broadcast:
- dims = [d for d in dims if not (is_literal(d) and int(d) <= 1)]
- if not all([d == dims[0] for d in dims]):
- self._add_suggested_merge(dims, apply=True)
-
- def _compute_matmul_shape(self, node, output_dtype=None):
- lhs_shape = self._get_shape(node, 0)
- rhs_shape = self._get_shape(node, 1)
- lhs_rank = len(lhs_shape)
- rhs_rank = len(rhs_shape)
- lhs_reduce_dim = 0
- rhs_reduce_dim = 0
- assert lhs_rank > 0 and rhs_rank > 0
- if lhs_rank == 1 and rhs_rank == 1:
- new_shape = []
- elif lhs_rank == 1:
- rhs_reduce_dim = -2
- new_shape = rhs_shape[:rhs_reduce_dim] + [rhs_shape[-1]]
- elif rhs_rank == 1:
- lhs_reduce_dim = -1
- new_shape = lhs_shape[:lhs_reduce_dim]
- else:
- lhs_reduce_dim = -1
- rhs_reduce_dim = -2
- new_shape = (
- self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2])
- + [lhs_shape[-2]]
- + [rhs_shape[-1]]
- )
- # merge reduce dim
- self._check_merged_dims(
- [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]],
- allow_broadcast=False,
- )
- if output_dtype is None:
- # infer output_dtype from input type when not specified
- output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(node.output[0], output_dtype, new_shape)
- )
-
- def _fuse_tensor_type(self, node, out_idx, dst_type, src_type):
- """
- update dst_tensor_type to be compatible with src_tensor_type when dimensions mismatch
- """
- dst_tensor_type = (
- dst_type.sequence_type.elem_type.tensor_type
- if is_sequence(dst_type)
- else dst_type.tensor_type
- )
- src_tensor_type = (
- src_type.sequence_type.elem_type.tensor_type
- if is_sequence(src_type)
- else src_type.tensor_type
- )
- if dst_tensor_type.elem_type != src_tensor_type.elem_type:
- node_id = node.name if node.name else node.op_type
- raise ValueError(
- f"For node {node_id}, dst_tensor_type.elem_type != src_tensor_type.elem_type: "
- f"{onnx.onnx_pb.TensorProto.DataType.Name(dst_tensor_type.elem_type)} vs "
- f"{onnx.onnx_pb.TensorProto.DataType.Name(src_tensor_type.elem_type)}"
- )
- if dst_tensor_type.HasField("shape"):
- for di, ds in enumerate(
- zip(dst_tensor_type.shape.dim, src_tensor_type.shape.dim)
- ):
- if ds[0] != ds[1]:
- # create a new symbolic dimension for node/out_idx/mismatch dim id in dst_tensor_type for tensor_type
- # for sequence_type, clear the dimension
- new_dim = onnx.TensorShapeProto.Dimension()
- if not is_sequence(dst_type):
- new_dim.dim_param = str(
- self._new_symbolic_dim_from_output(node, out_idx, di)
- )
- dst_tensor_type.shape.dim[di].CopyFrom(new_dim)
- else:
- dst_tensor_type.CopyFrom(src_tensor_type)
-
- def _infer_ArrayFeatureExtractor(self, node):
- data_shape = self._get_shape(node, 0)
- indices_shape = self._get_shape(node, 1)
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- data_shape[:-1] + indices_shape,
- )
- )
-
- def _infer_symbolic_compute_ops(self, node):
- funcs = {
- "Add": lambda l: l[0] + l[1],
- "Div": lambda l: l[0] // l[1], # integer div in sympy
- "Equal": lambda l: l[0] == l[1],
- "Floor": lambda l: sympy.floor(l[0]),
- "Max": lambda l: l[1]
- if is_literal(l[0]) and int(l[0]) < -self.int_max_
- else (
- l[0]
- if is_literal(l[1]) and int(l[1]) < -self.int_max_
- else sympy.Max(l[0], l[1])
- ),
- "Min": lambda l: l[1]
- if is_literal(l[0]) and int(l[0]) > self.int_max_
- else (
- l[0]
- if is_literal(l[1]) and int(l[1]) > self.int_max_
- else sympy.Min(l[0], l[1])
- ),
- "Mul": lambda l: l[0] * l[1],
- "Sub": lambda l: l[0] - l[1],
- "Where": lambda l: l[1] if l[0] else l[2],
- "Neg": lambda l: -l[0],
- }
- assert node.op_type in funcs
- self._compute_on_sympy_data(node, funcs[node.op_type])
-
- def _infer_Cast(self, node):
- self._pass_on_sympy_data(node)
-
- def _infer_CategoryMapper(self, node):
- input_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type
- if input_type == onnx.TensorProto.STRING:
- output_type = onnx.TensorProto.INT64
- else:
- output_type = onnx.TensorProto.STRING
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0], output_type, self._get_shape(node, 0)
- )
- )
-
- def _infer_Compress(self, node):
- input_shape = self._get_shape(node, 0)
- # create a new symbolic dimension for Compress output
- compress_len = str(self._new_symbolic_dim_from_output(node))
- axis = get_attribute(node, "axis")
- if axis is None:
- # when axis is not specified, input is flattened before compress so output is 1D
- output_shape = [compress_len]
- else:
- output_shape = input_shape
- output_shape[handle_negative_axis(axis, len(input_shape))] = compress_len
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- output_shape,
- )
- )
-
- def _infer_Concat(self, node):
- if any([i in self.sympy_data_ or i in self.initializers_ for i in node.input]):
- values = self._get_int_values(node)
- if all([v is not None for v in values]):
- assert 0 == get_attribute(node, "axis")
- self.sympy_data_[node.output[0]] = []
- for i in range(len(node.input)):
- value = values[i]
- if type(value) == list:
- self.sympy_data_[node.output[0]].extend(value)
- else:
- self.sympy_data_[node.output[0]].append(value)
-
- sympy_shape = self._get_sympy_shape(node, 0)
- axis = handle_negative_axis(get_attribute(node, "axis"), len(sympy_shape))
- for i_idx in range(1, len(node.input)):
- input_shape = self._get_sympy_shape(node, i_idx)
- if input_shape:
- sympy_shape[axis] = sympy_shape[axis] + input_shape[axis]
- self._update_computed_dims(sympy_shape)
- # merge symbolic dims for non-concat axes
- for d in range(len(sympy_shape)):
- if d == axis:
- continue
- dims = [
- self._get_shape(node, i_idx)[d]
- for i_idx in range(len(node.input))
- if self._get_shape(node, i_idx)
- ]
- if all([d == dims[0] for d in dims]):
- continue
- merged = self._merge_symbols(dims)
- if type(merged) == str:
- sympy_shape[d] = self.symbolic_dims_[merged] if merged else None
- else:
- sympy_shape[d] = merged
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- get_shape_from_sympy_shape(sympy_shape),
- )
- )
-
- def _infer_ConcatFromSequence(self, node):
- seq_shape = self._get_shape(node, 0)
- new_axis = 1 if get_attribute(node, "new_axis") else 0
- axis = handle_negative_axis(
- get_attribute(node, "axis"), len(seq_shape) + new_axis
- )
- concat_dim = str(self._new_symbolic_dim_from_output(node, 0, axis))
- new_shape = seq_shape
- if new_axis:
- new_shape = seq_shape[:axis] + [concat_dim] + seq_shape[axis:]
- else:
- new_shape[axis] = concat_dim
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[
- node.input[0]
- ].type.sequence_type.elem_type.tensor_type.elem_type,
- new_shape,
- )
- )
-
- def _infer_Constant(self, node):
- t = get_attribute(node, "value")
- self.sympy_data_[node.output[0]] = numpy_helper.to_array(t)
-
- def _infer_ConstantOfShape(self, node):
- sympy_shape = self._get_int_values(node)[0]
- vi = self.known_vi_[node.output[0]]
- if sympy_shape is not None:
- if type(sympy_shape) != list:
- sympy_shape = [sympy_shape]
- self._update_computed_dims(sympy_shape)
- # update sympy data if output type is int, and shape is known
- if vi.type.tensor_type.elem_type == onnx.TensorProto.INT64 and all(
- [is_literal(x) for x in sympy_shape]
- ):
- self.sympy_data_[node.output[0]] = np.ones(
- [int(x) for x in sympy_shape], dtype=np.int64
- ) * numpy_helper.to_array(get_attribute(node, "value", 0))
- else:
- # create new dynamic shape
- # note input0 is a 1D vector of shape, the new symbolic shape has the rank of the shape vector length
- sympy_shape = self._new_symbolic_shape(self._get_shape(node, 0)[0], node)
-
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- vi.type.tensor_type.elem_type,
- get_shape_from_sympy_shape(sympy_shape),
- )
- )
-
- def _infer_Conv(self, node):
- sympy_shape = self._compute_conv_pool_shape(node)
- self._update_computed_dims(sympy_shape)
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- vi.type.tensor_type.elem_type,
- get_shape_from_sympy_shape(sympy_shape),
- )
- )
-
- def _infer_Einsum(self, node):
- # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275
- equation = get_attribute(node, "equation")
- equation = equation.replace(b" ", b"")
- mid_index = equation.find(b"->")
- left_equation = equation[:mid_index] if mid_index != -1 else equation
-
- num_operands = 0
- num_ellipsis = 0
- num_ellipsis_indices = 0
-
- letter_to_dim = {}
-
- terms = left_equation.split(b",")
- for term in terms:
- ellipsis_index = term.find(b"...")
- shape = self._get_shape(node, num_operands)
- rank = len(shape)
- if ellipsis_index != -1:
- if num_ellipsis == 0:
- num_ellipsis_indices = rank - len(term) + 3
- num_ellipsis = num_ellipsis + 1
- for i in range(1, rank + 1):
- letter = term[-i]
- if letter != 46: # letter != b'.'
- dim = shape[-i]
- if letter not in letter_to_dim.keys():
- letter_to_dim[letter] = dim
- elif type(dim) != sympy.Symbol:
- letter_to_dim[letter] = dim
- num_operands = num_operands + 1
-
- new_sympy_shape = []
- from collections import OrderedDict
-
- num_letter_occurrences = OrderedDict()
- if mid_index != -1:
- right_equation = equation[mid_index + 2 :]
- right_ellipsis_index = right_equation.find(b"...")
- if right_ellipsis_index != -1:
- for i in range(num_ellipsis_indices):
- new_sympy_shape.append(shape[i])
- for c in right_equation:
- if c != 46: # c != b'.'
- new_sympy_shape.append(letter_to_dim[c])
- else:
- for i in range(num_ellipsis_indices):
- new_sympy_shape.append(shape[i])
- for c in left_equation:
- if c != 44 and c != 46: # c != b',' and c != b'.':
- if c in num_letter_occurrences:
- num_letter_occurrences[c] = num_letter_occurrences[c] + 1
- else:
- num_letter_occurrences[c] = 1
- for key, value in num_letter_occurrences.items():
- if value == 1:
- new_sympy_shape.append(letter_to_dim[key])
-
- output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(node.output[0], output_dtype, new_sympy_shape)
- )
-
- def _infer_Expand(self, node):
- expand_to_shape = as_list(self._try_get_value(node, 1), keep_none=True)
- if expand_to_shape is not None:
- # new_shape's dim can come from shape value
- self._update_computed_dims(expand_to_shape)
- shape = self._get_shape(node, 0)
- new_shape = self._broadcast_shapes(
- shape, get_shape_from_sympy_shape(expand_to_shape)
- )
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- new_shape,
- )
- )
-
- def _infer_Gather(self, node):
- data_shape = self._get_shape(node, 0)
- axis = handle_negative_axis(get_attribute(node, "axis", 0), len(data_shape))
- indices_shape = self._get_shape(node, 1)
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- data_shape[:axis] + indices_shape + data_shape[axis + 1 :],
- )
- )
- # for 1D input, do some sympy compute
- if (
- node.input[0] in self.sympy_data_
- and len(data_shape) == 1
- and 0 == get_attribute(node, "axis", 0)
- ):
- idx = self._try_get_value(node, 1)
- if idx is not None:
- data = self.sympy_data_[node.input[0]]
- if type(data) == list:
- if type(idx) == np.ndarray and len(idx.shape) == 1:
- self.sympy_data_[node.output[0]] = [data[int(i)] for i in idx]
- else:
- self.sympy_data_[node.output[0]] = data[int(idx)]
- else:
- assert idx == 0 or idx == -1
- self.sympy_data_[node.output[0]] = data
-
- def _infer_GatherElements(self, node):
- indices_shape = self._get_shape(node, 1)
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- indices_shape,
- )
- )
-
- def _infer_GatherND(self, node):
- data_shape = self._get_shape(node, 0)
- data_rank = len(data_shape)
- indices_shape = self._get_shape(node, 1)
- indices_rank = len(indices_shape)
- last_index_dimension = indices_shape[-1]
- assert is_literal(last_index_dimension) and last_index_dimension <= data_rank
- new_shape = indices_shape[:-1] + data_shape[last_index_dimension:]
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- new_shape,
- )
- )
-
- def _infer_If(self, node):
- # special case for constant condition, in case there are mismatched shapes from the non-executed branch
- subgraphs = [
- get_attribute(node, "then_branch"),
- get_attribute(node, "else_branch"),
- ]
- cond = self._try_get_value(node, 0)
- if cond is not None:
- if as_scalar(cond) > 0:
- subgraphs[1].CopyFrom(subgraphs[0])
- else:
- subgraphs[0].CopyFrom(subgraphs[1])
-
- for i_sub, subgraph in enumerate(subgraphs):
- subgraph_infer = self._onnx_infer_subgraph(
- node, subgraph, use_node_input=False
- )
- for i_out in range(len(node.output)):
- vi = self.known_vi_[node.output[i_out]]
- if i_sub == 0:
- vi.CopyFrom(subgraph.output[i_out])
- vi.name = node.output[i_out]
- else:
- self._fuse_tensor_type(
- node, i_out, vi.type, subgraph.output[i_out].type
- )
-
- # pass on sympy data from subgraph, if cond is constant
- if cond is not None and i_sub == (0 if as_scalar(cond) > 0 else 1):
- if subgraph.output[i_out].name in subgraph_infer.sympy_data_:
- self.sympy_data_[vi.name] = subgraph_infer.sympy_data_[
- subgraph.output[i_out].name
- ]
-
- def _infer_Loop(self, node):
- subgraph = get_attribute(node, "body")
- assert len(subgraph.input) == len(node.input)
- num_loop_carried = (
- len(node.input) - 2
- ) # minus the max trip count and the initial loop condition
- # when sequence_type is used as a loop carried input,
- # subgraph inference needs to run twice if a tensor shape in the sequence contains None
- for i, si in enumerate(subgraph.input):
- si_name = si.name
- si.CopyFrom(self.known_vi_[node.input[i]])
- si.name = si_name
-
- self._onnx_infer_subgraph(node, subgraph)
-
- # check subgraph input/output for shape changes in loop carried variables
- # for tensor_type, create new symbolic dim when changing, i.e., output = Concat(input, a)
- # for sequence_type, propagate from output to input
- need_second_infer = False
- for i_out in range(1, num_loop_carried + 1):
- so = subgraph.output[i_out]
- so_shape = get_shape_from_value_info(so)
- if is_sequence(so.type):
- if so_shape and None in so_shape:
- # copy shape from output to input
- # note that loop input is [loop_len, cond, input_0, input_1, ...]
- # while loop output is [cond, output_0, output_1, ...]
- subgraph.input[i_out + 1].type.sequence_type.elem_type.CopyFrom(
- so.type.sequence_type.elem_type
- )
- need_second_infer = True
- else:
- si = subgraph.input[i_out + 1]
- si_shape = get_shape_from_value_info(si)
- for di, dims in enumerate(zip(si_shape, so_shape)):
- if dims[0] != dims[1]:
- new_dim = onnx.TensorShapeProto.Dimension()
- new_dim.dim_param = str(
- self._new_symbolic_dim_from_output(node, i_out, di)
- )
- si.type.tensor_type.shape.dim[di].CopyFrom(new_dim)
- so.type.tensor_type.shape.dim[di].CopyFrom(new_dim)
- need_second_infer = True
-
- if need_second_infer:
- if self.verbose_ > 2:
- logger.debug(
- "Rerun Loop: {}({}...), because of sequence in loop carried variables".format(
- node.name, node.output[0]
- )
- )
- self._onnx_infer_subgraph(node, subgraph, inc_subgraph_id=False)
-
- # create a new symbolic dimension for iteration dependent dimension
- loop_iter_dim = str(self._new_symbolic_dim_from_output(node))
- for i in range(len(node.output)):
- vi = self.known_vi_[node.output[i]]
- vi.CopyFrom(
- subgraph.output[i + 1]
- ) # first subgraph output is condition, not in node output
- if i >= num_loop_carried:
- assert not is_sequence(
- vi.type
- ) # TODO: handle loop accumulation in sequence_type
- subgraph_vi_dim = subgraph.output[i + 1].type.tensor_type.shape.dim
- vi.type.tensor_type.shape.ClearField("dim")
- vi_dim = vi.type.tensor_type.shape.dim
- vi_dim.add().dim_param = loop_iter_dim
- vi_dim.extend(list(subgraph_vi_dim))
- vi.name = node.output[i]
-
- def _infer_MatMul(self, node):
- self._compute_matmul_shape(node)
-
- def _infer_MatMulInteger(self, node):
- self._compute_matmul_shape(node, onnx.TensorProto.INT32)
-
- def _infer_NonMaxSuppression(self, node):
- selected = str(self._new_symbolic_dim_from_output(node))
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0], onnx.TensorProto.INT64, [selected, 3]
- )
- )
-
- def _infer_NonZero(self, node):
- input_rank = self._get_shape_rank(node, 0)
- # create a new symbolic dimension for NonZero output
- nz_len = str(self._new_symbolic_dim_from_output(node, 0, 1))
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0], vi.type.tensor_type.elem_type, [input_rank, nz_len]
- )
- )
-
- def _infer_OneHot(self, node):
- sympy_shape = self._get_sympy_shape(node, 0)
- depth = self._try_get_value(node, 1)
- axis = get_attribute(node, "axis", -1)
- axis = handle_negative_axis(axis, len(sympy_shape) + 1)
- new_shape = get_shape_from_sympy_shape(
- sympy_shape[:axis]
- + [
- self._new_symbolic_dim_from_output(node)
- if not is_literal(depth)
- else depth
- ]
- + sympy_shape[axis:]
- )
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[2]].type.tensor_type.elem_type,
- new_shape,
- )
- )
-
- def _infer_Pad(self, node):
- if get_opset(self.out_mp_) <= 10:
- pads = get_attribute(node, "pads")
- else:
- pads = self._try_get_value(node, 1)
-
- sympy_shape = self._get_sympy_shape(node, 0)
- rank = len(sympy_shape)
-
- if pads is not None:
- assert len(pads) == 2 * rank
- new_sympy_shape = [
- d + pad_up + pad_down
- for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])
- ]
- self._update_computed_dims(new_sympy_shape)
- else:
- # dynamic pads, create new symbolic dimensions
- new_sympy_shape = self._new_symbolic_shape(rank, node)
- output_tp = self.known_vi_[node.input[0]].type.tensor_type.elem_type
-
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape)
- )
- )
-
- def _infer_Pool(self, node):
- sympy_shape = self._compute_conv_pool_shape(node)
- self._update_computed_dims(sympy_shape)
- for o in node.output:
- if not o:
- continue
- vi = self.known_vi_[o]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- o,
- vi.type.tensor_type.elem_type,
- get_shape_from_sympy_shape(sympy_shape),
- )
- )
-
- def _infer_aten_bitwise_or(self, node):
- shape0 = self._get_shape(node, 0)
- shape1 = self._get_shape(node, 1)
- new_shape = self._broadcast_shapes(shape0, shape1)
- t0 = self.known_vi_[node.input[0]]
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0], t0.type.tensor_type.elem_type, new_shape
- )
- )
-
- def _infer_aten_diagonal(self, node):
- sympy_shape = self._get_sympy_shape(node, 0)
- rank = len(sympy_shape)
- offset = self._try_get_value(node, 1)
- dim1 = self._try_get_value(node, 2)
- dim2 = self._try_get_value(node, 3)
-
- assert offset is not None and dim1 is not None and dim2 is not None
- dim1 = handle_negative_axis(dim1, rank)
- dim2 = handle_negative_axis(dim2, rank)
-
- new_shape = []
- for dim, val in enumerate(sympy_shape):
- if dim not in [dim1, dim2]:
- new_shape.append(val)
-
- shape1 = sympy_shape[dim1]
- shape2 = sympy_shape[dim2]
- if offset >= 0:
- diag_shape = sympy.Max(0, sympy.Min(shape1, shape2 - offset))
- else:
- diag_shape = sympy.Max(0, sympy.Min(shape1 + offset, shape2))
- new_shape.append(diag_shape)
-
- if node.output[0]:
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- get_shape_from_sympy_shape(new_shape),
- )
- )
-
- def _infer_aten_multinomial(self, node):
- sympy_shape = self._get_sympy_shape(node, 0)
- rank = len(sympy_shape)
- assert rank in [1, 2]
- num_samples = self._try_get_value(node, 1)
- di = rank - 1
- last_dim = (
- num_samples
- if num_samples
- else str(self._new_symbolic_dim_from_output(node, 0, di))
- )
- output_shape = sympy_shape[:-1] + [last_dim]
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- onnx.TensorProto.INT64,
- get_shape_from_sympy_shape(output_shape),
- )
- )
-
- def _infer_aten_pool2d(self, node):
- sympy_shape = self._get_sympy_shape(node, 0)
- assert len(sympy_shape) == 4
- sympy_shape[-2:] = [
- self._new_symbolic_dim_from_output(node, 0, i) for i in [2, 3]
- ]
- self._update_computed_dims(sympy_shape)
- for i, o in enumerate(node.output):
- if not o:
- continue
- vi = self.known_vi_[o]
- elem_type = (
- onnx.TensorProto.INT64
- if i == 1
- else self.known_vi_[node.input[0]].type.tensor_type.elem_type
- )
- vi.CopyFrom(
- helper.make_tensor_value_info(
- o, elem_type, get_shape_from_sympy_shape(sympy_shape)
- )
- )
-
- def _infer_aten_minmax(self, node):
- vi = self.known_vi_[node.output[0]]
- if len(node.input) == 1:
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- [],
- )
- )
- else:
- assert len(node.input) == 3
- keepdim = self._try_get_value(node, 2)
- assert keepdim is not None # can only handle known keepdim case.
- dim = self._try_get_value(node, 1)
- if dim is None:
- rank = self._get_shape_rank(node, 0)
- output_shape = self._new_symbolic_shape(
- rank if keepdim else rank - 1, node
- )
- else:
- shape = self._get_sympy_shape(node, 0)
- dim = handle_negative_axis(dim, len(shape))
- output_shape = shape[:dim]
- if keepdim:
- output_shape += [1]
- output_shape += shape[dim + 1 :]
-
- output_shape = get_shape_from_sympy_shape(output_shape)
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- output_shape,
- )
- )
- vi1 = self.known_vi_[node.output[1]]
- vi1.CopyFrom(
- helper.make_tensor_value_info(
- node.output[1], onnx.TensorProto.INT64, output_shape
- )
- )
-
- def _infer_aten_unfold(self, node):
- sympy_shape = self._get_sympy_shape(node, 0)
- dimension = self._try_get_value(node, 1)
- size = self._try_get_value(node, 2)
- step = self._try_get_value(node, 3)
- if dimension is not None and size is not None and step is not None:
- assert dimension < len(sympy_shape)
- sympy_shape[dimension] = (sympy_shape[dimension] - size) // step + 1
- sympy_shape.append(size)
- else:
- rank = len(sympy_shape)
- sympy_shape = self._new_symbolic_shape(rank + 1, node)
- self._update_computed_dims(sympy_shape)
- if node.output[0]:
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- get_shape_from_sympy_shape(sympy_shape),
- )
- )
-
- def _infer_aten_argmax(self, node):
- new_shape = None
- if node.input[1] == "":
- # The argmax of the flattened input is returned.
- new_shape = []
- else:
- dim = self._try_get_value(node, 1)
- keepdim = self._try_get_value(node, 2)
- if keepdim is not None:
- sympy_shape = self._get_sympy_shape(node, 0)
- if dim is not None:
- dim = handle_negative_axis(dim, len(sympy_shape))
- if keepdim:
- sympy_shape[dim] = 1
- else:
- del sympy_shape[dim]
- else:
- rank = len(sympy_shape)
- sympy_shape = self._new_symbolic_shape(
- rank if keepdim else rank - 1, node
- )
- self._update_computed_dims(sympy_shape)
- new_shape = get_shape_from_sympy_shape(sympy_shape)
- if node.output[0] and new_shape is not None:
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0], onnx.TensorProto.INT64, new_shape
- )
- )
-
- def _infer_BatchNormalization(self, node):
- self._propagate_shape_and_type(node)
-
- # this works for opsets < 14 and 14 since we check i < len(node.output) in the loop
- for i in [1, 2, 3, 4]:
- if i < len(node.output) and node.output[i] != "":
- # all of these parameters have the same shape as the 1st input
- self._propagate_shape_and_type(node, input_index=1, output_index=i)
-
- def _infer_Range(self, node):
- vi = self.known_vi_[node.output[0]]
- input_data = self._get_int_values(node)
- if all([i is not None for i in input_data]):
- start = as_scalar(input_data[0])
- limit = as_scalar(input_data[1])
- delta = as_scalar(input_data[2])
- new_sympy_shape = [sympy.Max(sympy.ceiling((limit - start) / delta), 0)]
- else:
- new_sympy_shape = [self._new_symbolic_dim_from_output(node)]
- self._update_computed_dims(new_sympy_shape)
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- get_shape_from_sympy_shape(new_sympy_shape),
- )
- )
-
- def _infer_ReduceSum(self, node):
- keep_dims = get_attribute(node, "keepdims", 1)
- if get_opset(self.out_mp_) >= 13 and len(node.input) > 1:
- # ReduceSum changes axes to input[1] in opset 13
- axes = self._try_get_value(node, 1)
- vi = self.known_vi_[node.output[0]]
- if axes is None:
- assert keep_dims # can only handle keep_dims==True when axes is unknown, by generating new ranks
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- get_shape_from_sympy_shape(
- self._new_symbolic_shape(
- self._get_shape_rank(node, 0), node
- )
- ),
- )
- )
- else:
- shape = self._get_shape(node, 0)
- output_shape = []
- axes = [handle_negative_axis(a, len(shape)) for a in axes]
- for i, d in enumerate(shape):
- if i in axes:
- if keep_dims:
- output_shape.append(1)
- else:
- output_shape.append(d)
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- output_shape,
- )
- )
-
- def _infer_ReduceProd(self, node):
- axes = get_attribute(node, "axes")
- keep_dims = get_attribute(node, "keepdims", 1)
- if keep_dims == 0 and axes == [0]:
- data = self._get_int_values(node)[0]
- if data is not None:
- self.sympy_data_[node.output[0]] = sympy_reduce_product(data)
-
- def _infer_Reshape(self, node):
- shape_value = self._try_get_value(node, 1)
- vi = self.known_vi_[node.output[0]]
- if shape_value is None:
- shape_shape = self._get_shape(node, 1)
- assert len(shape_shape) == 1
- shape_rank = shape_shape[0]
- assert is_literal(shape_rank)
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- vi.type.tensor_type.elem_type,
- get_shape_from_sympy_shape(
- self._new_symbolic_shape(shape_rank, node)
- ),
- )
- )
- else:
- input_sympy_shape = self._get_sympy_shape(node, 0)
- total = int(1)
- for d in input_sympy_shape:
- total = total * d
- new_sympy_shape = []
- deferred_dim_idx = -1
- non_deferred_size = int(1)
- for i, d in enumerate(shape_value):
- if type(d) == sympy.Symbol:
- new_sympy_shape.append(d)
- elif d == 0:
- new_sympy_shape.append(input_sympy_shape[i])
- non_deferred_size = non_deferred_size * input_sympy_shape[i]
- else:
- new_sympy_shape.append(d)
- if d == -1:
- deferred_dim_idx = i
- elif d != 0:
- non_deferred_size = non_deferred_size * d
-
- assert new_sympy_shape.count(-1) < 2
- if -1 in new_sympy_shape:
- new_dim = total // non_deferred_size
- new_sympy_shape[deferred_dim_idx] = new_dim
-
- self._update_computed_dims(new_sympy_shape)
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- vi.type.tensor_type.elem_type,
- get_shape_from_sympy_shape(new_sympy_shape),
- )
- )
-
- self._pass_on_sympy_data(node)
-
- def _infer_Resize(self, node):
- vi = self.known_vi_[node.output[0]]
- input_sympy_shape = self._get_sympy_shape(node, 0)
- if get_opset(self.out_mp_) <= 10:
- scales = self._try_get_value(node, 1)
- if scales is not None:
- new_sympy_shape = [
- sympy.simplify(sympy.floor(d * s))
- for d, s in zip(input_sympy_shape, scales)
- ]
- self._update_computed_dims(new_sympy_shape)
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- get_shape_from_sympy_shape(new_sympy_shape),
- )
- )
- else:
- roi = self._try_get_value(node, 1)
- scales = self._try_get_value(node, 2)
- sizes = self._try_get_value(node, 3)
- if sizes is not None:
- new_sympy_shape = [sympy.simplify(sympy.floor(s)) for s in sizes]
- self._update_computed_dims(new_sympy_shape)
- elif scales is not None:
- rank = len(scales)
- if (
- get_attribute(node, "coordinate_transformation_mode")
- == "tf_crop_and_resize"
- ):
- assert len(roi) == 2 * rank
- roi_start = list(roi)[:rank]
- roi_end = list(roi)[rank:]
- else:
- roi_start = [0] * rank
- roi_end = [1] * rank
- scales = list(scales)
- new_sympy_shape = [
- sympy.simplify(sympy.floor(d * (end - start) * scale))
- for d, start, end, scale in zip(
- input_sympy_shape, roi_start, roi_end, scales
- )
- ]
- self._update_computed_dims(new_sympy_shape)
- else:
- new_sympy_shape = self._new_symbolic_shape(
- self._get_shape_rank(node, 0), node
- )
-
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- get_shape_from_sympy_shape(new_sympy_shape),
- )
- )
-
- def _infer_Scan(self, node):
- subgraph = get_attribute(node, "body")
- num_scan_inputs = get_attribute(node, "num_scan_inputs")
- scan_input_axes = get_attribute(node, "scan_input_axes", [0] * num_scan_inputs)
- num_scan_states = len(node.input) - num_scan_inputs
- scan_input_axes = [
- handle_negative_axis(ax, self._get_shape_rank(node, i + num_scan_states))
- for i, ax in enumerate(scan_input_axes)
- ]
- # We may have cases where the subgraph has optional inputs that appear in both subgraph's input and initializer,
- # but not in the node's input. In such cases, the input model might be invalid, but let's skip those optional inputs.
- assert len(subgraph.input) >= len(node.input)
- subgraph_inputs = subgraph.input[: len(node.input)]
- for i, si in enumerate(subgraph_inputs):
- subgraph_name = si.name
- si.CopyFrom(self.known_vi_[node.input[i]])
- if i >= num_scan_states:
- scan_input_dim = si.type.tensor_type.shape.dim
- scan_input_dim.remove(
- scan_input_dim[scan_input_axes[i - num_scan_states]]
- )
- si.name = subgraph_name
- self._onnx_infer_subgraph(node, subgraph)
- num_scan_outputs = len(node.output) - num_scan_states
- scan_output_axes = get_attribute(
- node, "scan_output_axes", [0] * num_scan_outputs
- )
- scan_input_dim = get_shape_from_type_proto(self.known_vi_[node.input[-1]].type)[
- scan_input_axes[-1]
- ]
- for i, o in enumerate(node.output):
- vi = self.known_vi_[o]
- if i >= num_scan_states:
- shape = get_shape_from_type_proto(subgraph.output[i].type)
- new_dim = handle_negative_axis(
- scan_output_axes[i - num_scan_states], len(shape) + 1
- )
- shape = shape[:new_dim] + [scan_input_dim] + shape[new_dim:]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- o, subgraph.output[i].type.tensor_type.elem_type, shape
- )
- )
- else:
- vi.CopyFrom(subgraph.output[i])
- vi.name = o
-
- def _infer_ScatterElements(self, node):
- data_shape = self._get_shape(node, 0)
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- data_shape,
- )
- )
-
- def _infer_SequenceAt(self, node):
- # need to create new symbolic dimension if sequence shape has None:
- seq_shape = self._get_shape(node, 0)
- vi = self.known_vi_[node.output[0]]
- if seq_shape is not None:
- for di, d in enumerate(seq_shape):
- if d is not None:
- continue
- new_dim = onnx.TensorShapeProto.Dimension()
- new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, 0, di))
- vi.type.tensor_type.shape.dim[di].CopyFrom(new_dim)
-
- def _infer_SequenceInsert(self, node):
- # workaround bug in onnx's shape inference
- vi_seq = self.known_vi_[node.input[0]]
- vi_tensor = self.known_vi_[node.input[1]]
- vi_out_seq = self.known_vi_[node.output[0]]
- vi_out_seq.CopyFrom(vi_seq)
- vi_out_seq.name = node.output[0]
- self._fuse_tensor_type(node, 0, vi_out_seq.type, vi_tensor.type)
-
- def _infer_Shape(self, node):
- self.sympy_data_[node.output[0]] = self._get_sympy_shape(node, 0)
-
- def _infer_Size(self, node):
- sympy_shape = self._get_sympy_shape(node, 0)
- self.sympy_data_[node.output[0]] = sympy_reduce_product(sympy_shape)
- self.known_vi_[node.output[0]].CopyFrom(
- helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, [])
- )
-
- def _infer_Slice(self, node):
- def less_equal(x, y):
- try:
- return bool(x <= y)
- except TypeError:
- pass
- try:
- return bool(y >= x)
- except TypeError:
- pass
- try:
- return bool(-x >= -y)
- except TypeError:
- pass
- try:
- return bool(-y <= -x)
- except TypeError:
- # the last attempt; this may raise TypeError
- return bool(y - x >= 0)
-
- def handle_negative_index(index, bound):
- """normalizes a negative index to be in [0, bound)"""
- try:
- if not less_equal(0, index):
- if is_literal(index) and index <= -self.int_max_:
- # this case is handled separately
- return index
- return bound + index
- except TypeError:
- logger.warning("Cannot determine if {} < 0".format(index))
- return index
-
- if get_opset(self.out_mp_) <= 9:
- axes = get_attribute(node, "axes")
- starts = get_attribute(node, "starts")
- ends = get_attribute(node, "ends")
- if not axes:
- axes = list(range(len(starts)))
- steps = [1] * len(axes)
- else:
- starts = as_list(self._try_get_value(node, 1), keep_none=True)
- ends = as_list(self._try_get_value(node, 2), keep_none=True)
- axes = self._try_get_value(node, 3)
- steps = self._try_get_value(node, 4)
- if axes is None and not (starts is None and ends is None):
- axes = list(range(0, len(starts if starts is not None else ends)))
- if steps is None and not (starts is None and ends is None):
- steps = [1] * len(starts if starts is not None else ends)
- axes = as_list(axes, keep_none=True)
- steps = as_list(steps, keep_none=True)
-
- new_sympy_shape = self._get_sympy_shape(node, 0)
- if starts is None or ends is None:
- if axes is None:
- for i in range(len(new_sympy_shape)):
- new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i)
- else:
- new_sympy_shape = get_shape_from_sympy_shape(new_sympy_shape)
- for i in axes:
- new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i)
- else:
- for i, s, e, t in zip(axes, starts, ends, steps):
- e = handle_negative_index(e, new_sympy_shape[i])
- if is_literal(e):
- if e >= self.int_max_:
- e = new_sympy_shape[i]
- elif e <= -self.int_max_:
- e = 0 if s > 0 else -1
- elif is_literal(new_sympy_shape[i]):
- if e < 0:
- e = max(0, e + new_sympy_shape[i])
- e = min(e, new_sympy_shape[i])
- else:
- if e > 0:
- e = (
- sympy.Min(e, new_sympy_shape[i]) if e > 1 else e
- ) # special case for slicing first to make computation easier
- else:
- if is_literal(new_sympy_shape[i]):
- e = sympy.Min(e, new_sympy_shape[i])
- else:
- try:
- if not less_equal(e, new_sympy_shape[i]):
- e = new_sympy_shape[i]
- except Exception:
- logger.warning(
- "Unable to determine if {} <= {}, treat as equal".format(
- e, new_sympy_shape[i]
- )
- )
- e = new_sympy_shape[i]
-
- s = handle_negative_index(s, new_sympy_shape[i])
- if is_literal(new_sympy_shape[i]) and is_literal(s):
- s = max(0, min(s, new_sympy_shape[i]))
-
- new_sympy_shape[i] = sympy.simplify(
- (e - s + t + (-1 if t > 0 else 1)) // t
- )
-
- self._update_computed_dims(new_sympy_shape)
-
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- vi.type.tensor_type.elem_type,
- get_shape_from_sympy_shape(new_sympy_shape),
- )
- )
-
- # handle sympy_data if needed, for slice in shape computation
- if (
- node.input[0] in self.sympy_data_
- and [0] == axes
- and len(starts) == 1
- and len(ends) == 1
- and len(steps) == 1
- ):
- input_sympy_data = self.sympy_data_[node.input[0]]
- if type(input_sympy_data) == list or (
- type(input_sympy_data) == np.ndarray and len(input_sympy_data.shape) == 1
- ):
- self.sympy_data_[node.output[0]] = input_sympy_data[
- starts[0] : ends[0] : steps[0]
- ]
-
- def _infer_SoftmaxCrossEntropyLoss(self, node):
- vi = self.known_vi_[node.output[0]]
- elem_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type
- vi.type.tensor_type.elem_type = elem_type
- vi.type.tensor_type.shape.CopyFrom(onnx.TensorShapeProto())
-
- if len(node.output) > 1:
- data_shape = self._get_shape(node, 0)
- vi = self.known_vi_[node.output[1]]
- vi.CopyFrom(helper.make_tensor_value_info(vi.name, elem_type, data_shape))
-
- def _infer_Split_Common(self, node, make_value_info_func):
- input_sympy_shape = self._get_sympy_shape(node, 0)
- axis = handle_negative_axis(
- get_attribute(node, "axis", 0), len(input_sympy_shape)
- )
- split = get_attribute(node, "split")
- if not split:
- num_outputs = len(node.output)
- split = [input_sympy_shape[axis] / sympy.Integer(num_outputs)] * num_outputs
- self._update_computed_dims(split)
- else:
- split = [sympy.Integer(s) for s in split]
-
- for i_o in range(len(split)):
- vi = self.known_vi_[node.output[i_o]]
- vi.CopyFrom(
- make_value_info_func(
- node.output[i_o],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- get_shape_from_sympy_shape(
- input_sympy_shape[:axis]
- + [split[i_o]]
- + input_sympy_shape[axis + 1 :]
- ),
- )
- )
- self.known_vi_[vi.name] = vi
-
- def _infer_Split(self, node):
- self._infer_Split_Common(node, helper.make_tensor_value_info)
-
- def _infer_SplitToSequence(self, node):
- self._infer_Split_Common(node, helper.make_sequence_value_info)
-
- def _infer_Squeeze(self, node):
- input_shape = self._get_shape(node, 0)
- op_set = get_opset(self.out_mp_)
-
- # Depending on op-version 'axes' are provided as attribute or via 2nd input
- if op_set < 13:
- axes = get_attribute(node, "axes")
- assert self._try_get_value(node, 1) is None
- else:
- axes = self._try_get_value(node, 1)
- assert get_attribute(node, "axes") is None
-
- if axes is None:
- # No axes have been provided (neither via attribute nor via input).
- # In this case the 'Squeeze' op should remove all axes with dimension 1.
- # For symbolic dimensions we guess they are !=1.
- output_shape = [s for s in input_shape if s != 1]
- if self.verbose_ > 0:
- symbolic_dimensions = [s for s in input_shape if type(s) != int]
- if len(symbolic_dimensions) > 0:
- logger.debug(
- f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. "
- + f"Assuming the following dimensions are never equal to 1: {symbolic_dimensions}"
- )
- else:
- axes = [handle_negative_axis(a, len(input_shape)) for a in axes]
- output_shape = []
- for i in range(len(input_shape)):
- if i not in axes:
- output_shape.append(input_shape[i])
- else:
- assert input_shape[i] == 1 or type(input_shape[i]) != int
- if self.verbose_ > 0 and type(input_shape[i]) != int:
- logger.debug(
- f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. "
- + f"Assuming the dimension '{input_shape[i]}' at index {i} of the input to be equal to 1."
- )
-
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- output_shape,
- )
- )
- self._pass_on_sympy_data(node)
-
- def _infer_Tile(self, node):
- repeats_value = self._try_get_value(node, 1)
- new_sympy_shape = []
- if repeats_value is not None:
- input_sympy_shape = self._get_sympy_shape(node, 0)
- for i, d in enumerate(input_sympy_shape):
- new_dim = d * repeats_value[i]
- new_sympy_shape.append(new_dim)
- self._update_computed_dims(new_sympy_shape)
- else:
- new_sympy_shape = self._new_symbolic_shape(
- self._get_shape_rank(node, 0), node
- )
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- vi.type.tensor_type.elem_type,
- get_shape_from_sympy_shape(new_sympy_shape),
- )
- )
-
- def _infer_TopK(self, node):
- rank = self._get_shape_rank(node, 0)
- axis = handle_negative_axis(get_attribute(node, "axis", -1), rank)
- new_shape = self._get_shape(node, 0)
-
- if get_opset(self.out_mp_) <= 9:
- k = get_attribute(node, "k")
- else:
- k = self._get_int_values(node)[1]
-
- if k is None:
- k = self._new_symbolic_dim_from_output(node)
- else:
- k = as_scalar(k)
-
- if type(k) in [int, str]:
- new_shape[axis] = k
- else:
- new_sympy_shape = self._get_sympy_shape(node, 0)
- new_sympy_shape[axis] = k
- self._update_computed_dims(
- new_sympy_shape
- ) # note that TopK dim could be computed in sympy_data, so need to update computed_dims when it enters shape
- new_shape = get_shape_from_sympy_shape(new_sympy_shape)
-
- for i_o in range(len(node.output)):
- vi = self.known_vi_[node.output[i_o]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[i_o], vi.type.tensor_type.elem_type, new_shape
- )
- )
-
- def _infer_Transpose(self, node):
- if node.input[0] in self.sympy_data_:
- data_shape = self._get_shape(node, 0)
- perm = get_attribute(node, "perm", reversed(list(range(len(data_shape)))))
- input_data = self.sympy_data_[node.input[0]]
- self.sympy_data_[node.output[0]] = (
- np.transpose(
- np.array(input_data).reshape(*data_shape), axes=tuple(perm)
- )
- .flatten()
- .tolist()
- )
-
- def _infer_Unsqueeze(self, node):
- input_shape = self._get_shape(node, 0)
- op_set = get_opset(self.out_mp_)
-
- # Depending on op-version 'axes' are provided as attribute or via 2nd input
- if op_set < 13:
- axes = get_attribute(node, "axes")
- assert self._try_get_value(node, 1) is None
- else:
- axes = self._try_get_value(node, 1)
- assert get_attribute(node, "axes") is None
-
- output_rank = len(input_shape) + len(axes)
- axes = [handle_negative_axis(a, output_rank) for a in axes]
-
- input_axis = 0
- output_shape = []
- for i in range(output_rank):
- if i in axes:
- output_shape.append(1)
- else:
- output_shape.append(input_shape[input_axis])
- input_axis += 1
-
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0],
- self.known_vi_[node.input[0]].type.tensor_type.elem_type,
- output_shape,
- )
- )
-
- self._pass_on_sympy_data(node)
-
- def _infer_ZipMap(self, node):
- map_key_type = None
- if get_attribute(node, "classlabels_int64s") is not None:
- map_key_type = onnx.TensorProto.INT64
- elif get_attribute(node, "classlabels_strings") is not None:
- map_key_type = onnx.TensorProto.STRING
-
- assert map_key_type is not None
- new_vi = onnx.ValueInfoProto()
- new_vi.name = node.output[0]
- new_vi.type.sequence_type.elem_type.map_type.value_type.tensor_type.elem_type = (
- onnx.TensorProto.FLOAT
- )
- new_vi.type.sequence_type.elem_type.map_type.key_type = map_key_type
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(new_vi)
-
- def _infer_Attention(self, node):
- shape = self._get_shape(node, 0)
- shape_bias = self._get_shape(node, 2)
- assert len(shape) == 3 and len(shape_bias) == 1
- qkv_hidden_sizes_attr = get_attribute(node, "qkv_hidden_sizes")
- if qkv_hidden_sizes_attr is not None:
- assert len(qkv_hidden_sizes_attr) == 3
- shape[2] = int(qkv_hidden_sizes_attr[2])
- else:
- shape[2] = int(shape_bias[0] / 3)
- output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, shape))
-
- if len(node.output) > 1:
- # input shape: (batch_size, sequence_length, hidden_size)
- # past shape: (2, batch_size, num_heads, past_sequence_length, head_size)
- # mask shape: (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length) or (batch_size, 1, max_seq_len, max_seq_len)
- # present shape: (2, batch_size, num_heads, total_sequence_length, head_size), where total_sequence_length=sequence_length+past_sequence_length
- input_shape = self._get_shape(node, 0)
- past_shape = self._get_shape(node, 4)
- mask_shape = self._get_shape(node, 3)
- if len(past_shape) == 5:
- if len(mask_shape) in [2, 3]:
- past_shape[3] = mask_shape[-1]
- elif isinstance(input_shape[1], int) and isinstance(past_shape[3], int):
- past_shape[3] = input_shape[1] + past_shape[3]
- else:
- past_shape[3] = f"{past_shape[3]}+{input_shape[1]}"
- vi = self.known_vi_[node.output[1]]
- vi.CopyFrom(
- helper.make_tensor_value_info(vi.name, output_dtype, past_shape)
- )
-
- def _infer_BiasGelu(self, node):
- self._propagate_shape_and_type(node)
-
- def _infer_FastGelu(self, node):
- self._propagate_shape_and_type(node)
-
- def _infer_Gelu(self, node):
- self._propagate_shape_and_type(node)
-
- def _infer_LayerNormalization(self, node):
- self._propagate_shape_and_type(node)
-
- def _infer_LongformerAttention(self, node):
- self._propagate_shape_and_type(node)
-
- def _infer_EmbedLayerNormalization(self, node):
- input_ids_shape = self._get_shape(node, 0)
- word_embedding_shape = self._get_shape(node, 2)
- assert len(input_ids_shape) == 2 and len(word_embedding_shape) == 2
- output_shape = input_ids_shape + [word_embedding_shape[1]]
-
- word_embedding_dtype = self.known_vi_[node.input[2]].type.tensor_type.elem_type
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[0], word_embedding_dtype, output_shape
- )
- )
-
- mask_index_shape = [input_ids_shape[0]]
- vi = self.known_vi_[node.output[1]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[1], onnx.TensorProto.INT32, mask_index_shape
- )
- )
-
- if len(node.output) > 2:
- # Optional output of the add before layer normalization is done
- # shape is same as the output
- vi = self.known_vi_[node.output[2]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[2], word_embedding_dtype, output_shape
- )
- )
-
- def _infer_SkipLayerNormalization(self, node):
- self._propagate_shape_and_type(node)
-
- def _infer_PythonOp(self, node):
- output_tensor_types = get_attribute(node, "output_tensor_types")
- assert output_tensor_types
- output_tensor_ranks = get_attribute(node, "output_tensor_ranks")
- assert output_tensor_ranks
-
- # set the context output separately.
- # The first output is autograd's context.
- vi = self.known_vi_[node.output[0]]
- vi.CopyFrom(
- helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, [])
- )
-
- # Outputs after autograd's context are tensors.
- # We assume their ranks are fixed for different model inputs.
- for i in range(len(node.output) - 1):
- # Process the i-th tensor outputs.
- vi = self.known_vi_[node.output[i + 1]]
- sympy_shape = self._new_symbolic_shape(output_tensor_ranks[i], node)
- shape = get_shape_from_sympy_shape(sympy_shape)
- value_info = helper.make_tensor_value_info(
- node.output[i + 1], output_tensor_types[i], shape
- )
- vi.CopyFrom(value_info)
-
- def _propagate_shape_and_type(self, node, input_index=0, output_index=0):
- shape = self._get_shape(node, input_index)
- output_dtype = self.known_vi_[
- node.input[input_index]
- ].type.tensor_type.elem_type
- vi = self.known_vi_[node.output[output_index]]
- vi.CopyFrom(
- helper.make_tensor_value_info(
- node.output[output_index], output_dtype, shape
- )
- )
-
- def _is_none_dim(self, dim_value):
- if type(dim_value) != str:
- return False
- if "unk__" not in dim_value:
- return False
- if dim_value in self.symbolic_dims_.keys():
- return False
- return True
-
- def _is_shape_contains_none_dim(self, out_shape):
- for out in out_shape:
- if self._is_none_dim(out):
- return out
- return None
-
- def _infer_impl(self, start_sympy_data=None):
- self.sympy_data_ = start_sympy_data or {}
- self.out_mp_.graph.ClearField("value_info")
- self._apply_suggested_merge(graph_input_only=True)
- self.input_symbols_ = set()
- for i in self.out_mp_.graph.input:
- input_shape = get_shape_from_value_info(i)
- if input_shape is None:
- continue
-
- if is_sequence(i.type):
- input_dims = i.type.sequence_type.elem_type.tensor_type.shape.dim
- else:
- input_dims = i.type.tensor_type.shape.dim
-
- for i_dim, dim in enumerate(input_shape):
- if dim is None:
- # some models use None for symbolic dim in input, replace it with a string
- input_dims[i_dim].dim_param = str(
- self._new_symbolic_dim(i.name, i_dim)
- )
-
- self.input_symbols_.update([d for d in input_shape if type(d) == str])
-
- for s in self.input_symbols_:
- if s in self.suggested_merge_:
- s_merge = self.suggested_merge_[s]
- assert s_merge in self.symbolic_dims_
- self.symbolic_dims_[s] = self.symbolic_dims_[s_merge]
- else:
- # Since inputs are not produced by other ops, we can assume positivity
- self.symbolic_dims_[s] = sympy.Symbol(s, integer=True, positive=True)
- # create a temporary ModelProto for single node inference
-        # note that we remove initializers to make inference faster
-        # for tensor ops like Reshape/Tile/Expand that read initializers, we need to do sympy-based computation anyway
- self.tmp_mp_ = onnx.ModelProto()
- self.tmp_mp_.CopyFrom(self.out_mp_)
- self.tmp_mp_.graph.ClearField("initializer")
-
-        # compute prerequisites for each node for topological sort
- # node with subgraphs may have dependency on implicit inputs, which will affect topological sort
-        # map from node to all of its inputs, including implicit ones in subgraphs
-        prereq_for_node = {}
-
- def get_prereq(node):
- names = set(i for i in node.input if i)
- subgraphs = []
- if "If" == node.op_type:
- subgraphs = [
- get_attribute(node, "then_branch"),
- get_attribute(node, "else_branch"),
- ]
- elif node.op_type in ["Loop", "Scan"]:
- subgraphs = [get_attribute(node, "body")]
- for g in subgraphs:
- g_outputs_and_initializers = {i.name for i in g.initializer}
- g_prereq = set()
- for n in g.node:
- g_outputs_and_initializers.update(n.output)
- for n in g.node:
- g_prereq.update(
- [
- i
- for i in get_prereq(n)
- if i not in g_outputs_and_initializers
- ]
- )
- names.update(g_prereq)
- # remove subgraph inputs from g_prereq since those are local-only
- for i in g.input:
- if i.name in names:
- names.remove(i.name)
- return names
-
- for n in self.tmp_mp_.graph.node:
- prereq_for_node[n.output[0]] = get_prereq(n)
-
-        # topologically sort nodes; there might be dead nodes, so we check that all graph outputs are reached before terminating
- sorted_nodes = []
- sorted_known_vi = set(
- [
- i.name
- for i in list(self.out_mp_.graph.input)
- + list(self.out_mp_.graph.initializer)
- ]
- )
- if any([o.name in sorted_known_vi for o in self.out_mp_.graph.output]):
- # Loop/Scan will have some graph output in graph inputs, so don't do topological sort
- sorted_nodes = self.out_mp_.graph.node
- else:
- while not all(
- [o.name in sorted_known_vi for o in self.out_mp_.graph.output]
- ):
- old_sorted_nodes_len = len(sorted_nodes)
- for node in self.out_mp_.graph.node:
- if (node.output[0] not in sorted_known_vi) and all(
- [
- i in sorted_known_vi
- for i in prereq_for_node[node.output[0]]
- if i
- ]
- ):
- sorted_known_vi.update(node.output)
- sorted_nodes.append(node)
- if old_sorted_nodes_len == len(sorted_nodes) and not all(
- [o.name in sorted_known_vi for o in self.out_mp_.graph.output]
- ):
- raise Exception("Invalid model with cyclic graph")
-
- for node in sorted_nodes:
- assert all([i in self.known_vi_ for i in node.input if i])
- self._onnx_infer_single_node(node)
- known_aten_op = False
- if node.op_type in self.dispatcher_:
- self.dispatcher_[node.op_type](node)
- elif node.op_type in ["ConvTranspose"]:
-                # onnx shape inference for ops like ConvTranspose may produce an empty shape for symbolic inputs
-                # until symbolic compute is added for them,
-                # mark the output type as UNDEFINED to allow the output rank to be guessed
- vi = self.known_vi_[node.output[0]]
- if len(vi.type.tensor_type.shape.dim) == 0:
- vi.type.tensor_type.elem_type = onnx.TensorProto.UNDEFINED
- elif node.op_type == "ATen" and node.domain == "org.pytorch.aten":
- for attr in node.attribute:
- # TODO: Is overload_name needed?
- if attr.name == "operator":
- aten_op_name = (
- attr.s.decode("utf-8")
- if isinstance(attr.s, bytes)
- else attr.s
- )
- if aten_op_name in self.aten_op_dispatcher_:
- known_aten_op = True
- self.aten_op_dispatcher_[aten_op_name](node)
- break
-
- if self.verbose_ > 2:
- logger.debug(node.op_type + ": " + node.name)
- for i, name in enumerate(node.input):
- logger.debug(
- " Input {}: {} {}".format(
- i, name, "initializer" if name in self.initializers_ else ""
- )
- )
-
-            # onnx automatically merges dims with values, e.g. Mul(['aaa', 'bbb'], [1000, 1]) -> [1000, 'bbb']
-            # symbolic shape inference needs to apply the merge 'aaa' -> 1000 in this case
- if node.op_type in [
- "Add",
- "Sub",
- "Mul",
- "Div",
- "MatMul",
- "MatMulInteger",
- "MatMulInteger16",
- "Where",
- "Sum",
- ]:
- vi = self.known_vi_[node.output[0]]
- out_rank = len(get_shape_from_type_proto(vi.type))
- in_shapes = [self._get_shape(node, i) for i in range(len(node.input))]
- for d in range(
- out_rank
- - (
- 2
- if node.op_type
- in ["MatMul", "MatMulInteger", "MatMulInteger16"]
- else 0
- )
- ):
- in_dims = [
- s[len(s) - out_rank + d]
- for s in in_shapes
- if len(s) + d >= out_rank
- ]
- if len(in_dims) > 1:
- self._check_merged_dims(in_dims, allow_broadcast=True)
-
- for i_o in range(len(node.output)):
- vi = self.known_vi_[node.output[i_o]]
- out_type = vi.type
- out_type_kind = out_type.WhichOneof("value")
-
- # do not process shape for non-tensors
- if out_type_kind not in ["tensor_type", "sparse_tensor_type", None]:
- if self.verbose_ > 2:
- if out_type_kind == "sequence_type":
- seq_cls_type = out_type.sequence_type.elem_type.WhichOneof(
- "value"
- )
- if "tensor_type" == seq_cls_type:
- logger.debug(
- " {}: sequence of {} {}".format(
- node.output[i_o],
- str(get_shape_from_value_info(vi)),
- onnx.TensorProto.DataType.Name(
- vi.type.sequence_type.elem_type.tensor_type.elem_type
- ),
- )
- )
- else:
- logger.debug(
- " {}: sequence of {}".format(
- node.output[i_o], seq_cls_type
- )
- )
- else:
- logger.debug(
- " {}: {}".format(node.output[i_o], out_type_kind)
- )
- continue
-
- out_shape = get_shape_from_value_info(vi)
- out_type_undefined = (
- out_type.tensor_type.elem_type == onnx.TensorProto.UNDEFINED
- )
- if self.verbose_ > 2:
- logger.debug(
- " {}: {} {}".format(
- node.output[i_o],
- str(out_shape),
- onnx.TensorProto.DataType.Name(
- vi.type.tensor_type.elem_type
- ),
- )
- )
- if node.output[i_o] in self.sympy_data_:
- logger.debug(
- " Sympy Data: " + str(self.sympy_data_[node.output[i_o]])
- )
-
-                # onnx >= 1.11.0 uses unk__#index instead of None when a shape dim is uncertain
- if (
- out_shape is not None
- and (
- None in out_shape or self._is_shape_contains_none_dim(out_shape)
- )
- ) or out_type_undefined:
- if self.auto_merge_:
- if node.op_type in [
- "Add",
- "Sub",
- "Mul",
- "Div",
- "MatMul",
- "MatMulInteger",
- "MatMulInteger16",
- "Concat",
- "Where",
- "Sum",
- "Equal",
- "Less",
- "Greater",
- "LessOrEqual",
- "GreaterOrEqual",
- "Min",
- "Max",
- ]:
- shapes = [
- self._get_shape(node, i) for i in range(len(node.input))
- ]
- if node.op_type in [
- "MatMul",
- "MatMulInteger",
- "MatMulInteger16",
- ]:
- if (
- None in out_shape
- or self._is_shape_contains_none_dim(out_shape)
- ):
- if None in out_shape:
- idx = out_shape.index(None)
- else:
- idx = out_shape.index(
- self._is_shape_contains_none_dim(out_shape)
- )
- dim_idx = [
- len(s) - len(out_shape) + idx for s in shapes
- ]
- # only support auto merge for MatMul for dim < rank-2 when rank > 2
- assert (
- len(shapes[0]) > 2
- and dim_idx[0] < len(shapes[0]) - 2
- )
- assert (
- len(shapes[1]) > 2
- and dim_idx[1] < len(shapes[1]) - 2
- )
- elif node.op_type == "Expand":
- # auto merge for cases like Expand([min(batch, 1), min(seq, 512)], [batch, seq])
- shapes = [
- self._get_shape(node, 0),
- self._get_value(node, 1),
- ]
- else:
- shapes = []
-
- if shapes:
- for idx in range(len(out_shape)):
- if out_shape[idx] is not None and not self._is_none_dim(
- out_shape[idx]
- ):
- continue
- # note that the broadcasting rule aligns from right to left
- # if a tensor has a lower rank (dim_idx[idx] < 0), it would automatically broadcast and need no merge
- dim_idx = [
- len(s) - len(out_shape) + idx for s in shapes
- ]
- if len(dim_idx) > 0:
- self._add_suggested_merge(
- [
- s[i] if is_literal(s[i]) else str(s[i])
- for s, i in zip(shapes, dim_idx)
- if i >= 0
- ]
- )
- self.run_ = True
- else:
- self.run_ = False
- else:
- self.run_ = False
-
- # create new dynamic dims for ops not handled by symbolic shape inference
- if (
-                    not self.run_
-                    and node.op_type not in self.dispatcher_
-                    and not known_aten_op
- ):
- is_unknown_op = out_type_undefined and (
- out_shape is None or len(out_shape) == 0
- )
- if is_unknown_op:
- # unknown op to ONNX, maybe from higher opset or other domain
- # only guess the output rank from input 0 when using guess_output_rank option
- out_rank = (
- self._get_shape_rank(node, 0)
- if self.guess_output_rank_
- else -1
- )
- else:
- # valid ONNX op, but not handled by symbolic shape inference, just assign dynamic shape
- out_rank = len(out_shape)
-
- if out_rank >= 0:
- new_shape = self._new_symbolic_shape(out_rank, node, i_o)
- if out_type_undefined:
- # guess output data type from input vi if not defined
- out_dtype = self.known_vi_[
- node.input[0]
- ].type.tensor_type.elem_type
- else:
- # otherwise, use original data type
- out_dtype = vi.type.tensor_type.elem_type
- vi.CopyFrom(
- helper.make_tensor_value_info(
- vi.name,
- out_dtype,
- get_shape_from_sympy_shape(new_shape),
- )
- )
-
- if self.verbose_ > 0:
- if is_unknown_op:
- logger.debug(
- "Possible unknown op: {} node: {}, guessing {} shape".format(
- node.op_type, node.name, vi.name
- )
- )
- if self.verbose_ > 2:
- logger.debug(
- " {}: {} {}".format(
- node.output[i_o],
- str(new_shape),
- vi.type.tensor_type.elem_type,
- )
- )
-
- self.run_ = True
- continue # continue the inference after guess, no need to stop as no merge is needed
-
- if self.verbose_ > 0 or not self.auto_merge_ or out_type_undefined:
- logger.debug(
- "Stopping at incomplete shape inference at "
- + node.op_type
- + ": "
- + node.name
- )
- logger.debug("node inputs:")
- for i in node.input:
- logger.debug(self.known_vi_[i])
- logger.debug("node outputs:")
- for o in node.output:
- logger.debug(self.known_vi_[o])
- if self.auto_merge_ and not out_type_undefined:
- logger.debug("Merging: " + str(self.suggested_merge_))
- return False
-
- self.run_ = False
- return True
-
- def _update_output_from_vi(self):
- for output in self.out_mp_.graph.output:
- if output.name in self.known_vi_:
- output.CopyFrom(self.known_vi_[output.name])
-
- @staticmethod
- def infer_shapes(
- in_mp, int_max=2**31 - 1, auto_merge=False, guess_output_rank=False, verbose=0
- ):
- onnx_opset = get_opset(in_mp)
- if (not onnx_opset) or onnx_opset < 7:
- logger.warning("Only support models of onnx opset 7 and above.")
- return None
- symbolic_shape_inference = SymbolicShapeInference(
- int_max, auto_merge, guess_output_rank, verbose
- )
- all_shapes_inferred = False
- symbolic_shape_inference._preprocess(in_mp)
- while symbolic_shape_inference.run_:
- all_shapes_inferred = symbolic_shape_inference._infer_impl()
- symbolic_shape_inference._update_output_from_vi()
- if not all_shapes_inferred:
- logger.warning("Incomplete symbolic shape inference")
- return symbolic_shape_inference.out_mp_
-
-
-def parse_arguments():
- parser = argparse.ArgumentParser()
- parser.add_argument("--input", required=True, help="The input model file")
- parser.add_argument("--output", help="The output model file")
- parser.add_argument(
- "--auto_merge",
- help="Automatically merge symbolic dims when confliction happens",
- action="store_true",
- default=False,
- )
- parser.add_argument(
- "--int_max",
- help="maximum value for integer to be treated as boundless for ops like slice",
- type=int,
- default=2**31 - 1,
- )
- parser.add_argument(
- "--guess_output_rank",
- help="guess output rank to be the same as input 0 for unknown ops",
- action="store_true",
- default=False,
- )
- parser.add_argument(
- "--verbose",
- help="Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed",
- type=int,
- default=0,
- )
- parser.add_argument(
- "--save_as_external_data",
- help="Saving an ONNX model to external data",
- action="store_true",
- default=False,
- )
- parser.add_argument(
- "--all_tensors_to_one_file",
- help="Saving all the external data to one file",
- action="store_true",
- default=False,
- )
- parser.add_argument(
- "--external_data_location",
- help="The file location to save the external file",
- default="./",
- )
- parser.add_argument(
- "--external_data_size_threshold",
- help="The size threshold for external data",
- type=int,
- default=1024,
- )
- return parser.parse_args()
-
-
-if __name__ == "__main__":
- args = parse_arguments()
- logger.info("input model: " + args.input)
- if args.output:
- logger.info("output model " + args.output)
- logger.info("Doing symbolic shape inference...")
- out_mp = SymbolicShapeInference.infer_shapes(
- onnx.load(args.input),
- args.int_max,
- args.auto_merge,
- args.guess_output_rank,
- args.verbose,
- )
- if args.output and out_mp:
- if args.save_as_external_data:
- onnx.save_model(
- out_mp,
- args.output,
- save_as_external_data=True,
- all_tensors_to_one_file=args.all_tensors_to_one_file,
- location=args.external_data_location,
- size_threshold=args.external_data_size_threshold,
- convert_attribute=False,
- )
- else:
- onnx.save(out_mp, args.output)
- logger.info("Done!")
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt
deleted file mode 100644
index b80f9f4022328703df32af16182ea930645a6db6..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-onnxsim
-packaging
-sympy