diff --git a/models/cv/classification/swin_transformer_large/ixrt/README.md b/models/cv/classification/swin_transformer_large/ixrt/README.md
index f09a5c125c0dc5a021aa2d2d1fd25961d08bf02b..032c961d88878d66c4c830cac3b3273545d80295 100644
--- a/models/cv/classification/swin_transformer_large/ixrt/README.md
+++ b/models/cv/classification/swin_transformer_large/ixrt/README.md
@@ -10,8 +10,6 @@ Swin Transformer-Large is a variant of the Swin Transformer, an architecture des
 | :----: | :----: | :----: |
 | MR-V100 | 4.2.0 | 25.03 |
 
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
 ## Model Preparation
 
 ### Prepare Resources
@@ -53,6 +51,7 @@ python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/swin-large/s
 
 ```bash
 git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 
 export ORIGIN_ONNX_NAME=./swin-large-torch-fp32
 export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh b/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh
index b7fe2e695819ea348d0045c5774cc5e7af8037f2..02ac2c462036a5839f09d7448278098897d21679 100644
--- a/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh
+++ b/models/cv/classification/swin_transformer_large/ixrt/ci/prepare.sh
@@ -26,6 +26,8 @@ else
 fi
 
 apt install -y libnuma-dev
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 pip install -r requirements.txt
 
 mkdir -p general_perf/model_zoo/regular
diff --git a/models/nlp/plm/albert/ixrt/README.md b/models/nlp/plm/albert/ixrt/README.md
index 5944c1d15e499710580328e7b981568e83916586..778719bddff35be6d4fc5136b18e6efcc3d96da5 100644
--- a/models/nlp/plm/albert/ixrt/README.md
+++ b/models/nlp/plm/albert/ixrt/README.md
@@ -10,8 +10,6 @@ Albert (A Lite BERT) is a variant of the BERT (Bidirectional Encoder Representat
 | :----: | :----: | :----: |
 | MR-V100 | 4.2.0 | 25.03 |
 
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
 ## Model Preparation
 
 ### Prepare Resources
@@ -51,6 +49,7 @@ onnxsim albert-torch-fp32.onnx albert-torch-fp32-sim.onnx
 
 ```bash
 git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 
 export ORIGIN_ONNX_NAME=./albert-torch-fp32-sim
 export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/nlp/plm/albert/ixrt/ci/prepare.sh b/models/nlp/plm/albert/ixrt/ci/prepare.sh
index 68e8aa19da2132447fdfe6ea48f42bc026f48d7c..9e0dc3b925183fc0ca18848d3dd31cdec4bdf2f1 100644
--- a/models/nlp/plm/albert/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/albert/ixrt/ci/prepare.sh
@@ -21,6 +21,8 @@ apt install -y libnuma-dev
 pip3 install -r requirements.txt
 
 cp /root/data/3rd_party/albert-torch-fp32.json ./
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 python3 torch2onnx.py --model_path /root/data/checkpoints/open_albert/albert-base-squad.pt --output_path albert-torch-fp32.onnx
 
 onnxsim albert-torch-fp32.onnx albert-torch-fp32-sim.onnx
diff --git a/models/nlp/plm/deberta/ixrt/README.md b/models/nlp/plm/deberta/ixrt/README.md
index b683a4cbde82ad5c7fa2d7964824ba2f6489afee..87496848406c894ec31d9886f4bbc6c6123980c1 100644
--- a/models/nlp/plm/deberta/ixrt/README.md
+++ b/models/nlp/plm/deberta/ixrt/README.md
@@ -15,8 +15,6 @@ fine-tuning to better suit specific downstream tasks, thereby improving the mode
 | :----: | :----: | :----: |
 | MR-V100 | 4.2.0 | 25.03 |
 
-**This model is compatible with IXUCA SDK up to version 4.2.0.**
-
 ## Model Preparation
 
 ### Prepare Resources
@@ -55,6 +53,7 @@ python3 remove_clip_and_cast.py
 
 ```bash
 git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 
 export ORIGIN_ONNX_NAME=./deberta-sim-drop-clip-drop-invaild-cast
 export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py
diff --git a/models/nlp/plm/deberta/ixrt/ci/prepare.sh b/models/nlp/plm/deberta/ixrt/ci/prepare.sh
index d440393e7ed913ae6a92fc0ab043a5744086f8c1..23ecd2b5bc02b6076db66490f28ab18efe07b86f 100644
--- a/models/nlp/plm/deberta/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/deberta/ixrt/ci/prepare.sh
@@ -21,6 +21,8 @@ apt install -y libnuma-dev
 pip install -r requirements.txt
 
 cp /root/data/3rd_party/deberta-torch-fp32.json ./
+git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1
+cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/
 python3 torch2onnx.py --model_path /root/data/checkpoints/open_deberta/deberta-base-squad.pt --output_path deberta-torch-fp32.onnx
 
 onnxsim deberta-torch-fp32.onnx deberta-torch-fp32-sim.onnx
diff --git a/models/nlp/plm/roberta/ixrt/README.md b/models/nlp/plm/roberta/ixrt/README.md
index acd1b45869ad0103681a7e65488071e52494576f..92cc8e4eb8dfbb8e3490eab6aabaf38134c731b9 100644
--- a/models/nlp/plm/roberta/ixrt/README.md
+++ b/models/nlp/plm/roberta/ixrt/README.md
@@ -17,8 +17,6 @@ our models and code.
| :----: | :----: | :----: | | MR-V100 | 4.2.0 | 25.03 | -**This model is compatible with IXUCA SDK up to version 4.2.0.** - ## Model Preparation ### Prepare Resources @@ -62,6 +60,7 @@ onnxsim open_roberta/roberta-torch-fp32.onnx open_roberta/roberta-torch-fp32_sim ```bash git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ export ORIGIN_ONNX_NAME=./open_roberta/roberta-torch-fp32_sim export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py diff --git a/models/nlp/plm/roberta/ixrt/ci/prepare.sh b/models/nlp/plm/roberta/ixrt/ci/prepare.sh index 81d02ab0621e5c06580fe8469fc9c2012ca3c3ee..5f00f9e9ac7096d7d17d9c1a50cd416c6db432de 100644 --- a/models/nlp/plm/roberta/ixrt/ci/prepare.sh +++ b/models/nlp/plm/roberta/ixrt/ci/prepare.sh @@ -19,6 +19,8 @@ set -x apt install -y libnuma-dev pip install -r requirements.txt +git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ mkdir -p data cp -r /root/data/checkpoints/open_roberta data/ diff --git a/models/nlp/plm/roformer/ixrt/README.md b/models/nlp/plm/roformer/ixrt/README.md index 890158fa2f42669484032186d1333b1187a9a860..5d37b5e6eb6ac8d7c0ce107ee5b248e64ba96a11 100644 --- a/models/nlp/plm/roformer/ixrt/README.md +++ b/models/nlp/plm/roformer/ixrt/README.md @@ -19,8 +19,6 @@ datasets. | :----: | :----: | :----: | | MR-V100 | 4.2.0 | 25.03 | -**This model is compatible with IXUCA SDK up to version 4.2.0.** - ## Model Preparation ### Prepare Resources @@ -68,6 +66,7 @@ python3 deploy.py --model_path ./data/open_roformer/roformer-frozen.onnx --outpu ```bash git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ export ORIGIN_ONNX_NAME=./data/open_roformer/roformer-frozen export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py diff --git a/models/nlp/plm/roformer/ixrt/ci/prepare.sh b/models/nlp/plm/roformer/ixrt/ci/prepare.sh index ea80462db022331cb8b9c20f12a15e9ef8b0bdd6..deda09efeb451ceafa37daf0b0f519e209e9249f 100644 --- a/models/nlp/plm/roformer/ixrt/ci/prepare.sh +++ b/models/nlp/plm/roformer/ixrt/ci/prepare.sh @@ -19,6 +19,8 @@ set -x apt install -y libnuma-dev pip install -r requirements.txt +git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ mkdir -p data cp -r /root/data/checkpoints/open_roformer data/ diff --git a/models/nlp/plm/roformer/ixrt/export_onnx.py b/models/nlp/plm/roformer/ixrt/export_onnx.py index 475dddd7c2ab27b6ca342be98ea92d2c791ff60b..a0213bb449c7d632fdda2b43279037d6883f3424 100644 --- a/models/nlp/plm/roformer/ixrt/export_onnx.py +++ b/models/nlp/plm/roformer/ixrt/export_onnx.py @@ -16,7 +16,7 @@ import tf2onnx from tf2onnx import tf_loader import argparse -ONNX_OPSET = 11 +ONNX_OPSET = 13 def _convert_graphdef_to_onnx(graph_def, inputs=None, diff --git a/models/nlp/plm/videobert/ixrt/README.md b/models/nlp/plm/videobert/ixrt/README.md index 2f47a69bf90d4bc9d3e04fc17a457e260a6530c4..ded0114471da00dded55f6910c833998411cba4c 100644 --- a/models/nlp/plm/videobert/ixrt/README.md +++ b/models/nlp/plm/videobert/ixrt/README.md @@ 
-12,8 +12,6 @@ and textual information into a unified framework. | :----: | :----: | :----: | | MR-V100 | 4.2.0 | 25.03 | -**This model is compatible with IXUCA SDK up to version 4.2.0.** - ## Model Preparation ### Prepare Resources @@ -43,6 +41,7 @@ pip3 install -r requirements.txt ```bash git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ export ORIGIN_ONNX_NAME=./general_perf/model_zoo/popular/open_videobert/video-bert export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py diff --git a/models/nlp/plm/videobert/ixrt/ci/prepare.sh b/models/nlp/plm/videobert/ixrt/ci/prepare.sh index 0d46c6c023fc58658a230714d3a1b06cc9430c2b..7d5f8fa49779ce6d6b52d088cca2ad0ce4a9dd5a 100644 --- a/models/nlp/plm/videobert/ixrt/ci/prepare.sh +++ b/models/nlp/plm/videobert/ixrt/ci/prepare.sh @@ -19,6 +19,8 @@ set -x apt install -y libnuma-dev pip install -r requirements.txt +git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 +cp -r iluvatar-corex-ixrt/tools/optimizer/ ../../../../../toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/ mkdir -p data cp -r /root/data/checkpoints/open_videobert data/ diff --git a/tests/model_info.json b/tests/model_info.json index e230305fff2fbed1fb15d7f585ed35b9ac71d43d..58adf01b0326d3a63a27d3bf3b1d31ebde2b15d3 100644 --- a/tests/model_info.json +++ b/tests/model_info.json @@ -3003,8 +3003,8 @@ "release_version": "25.03", "release_sdk": "CoreX 4.2.0", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "cv/classification", "toolbox": "", "mdims": "", @@ -5827,8 +5827,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", @@ -6025,8 +6025,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", @@ -6058,8 +6058,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", @@ -6091,8 +6091,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", @@ -6124,8 +6124,8 @@ "release_version": "24.09", "release_sdk": "4.1.2", "release_gpgpu": "MR-V100", - "latest_sdk": "4.2.0", - "latest_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", "category": "nlp/plm", "toolbox": "", "mdims": "", diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py index eb25acab7388ad14c509fd48a0862ff0bbec7f32..df6f59e122c6e529d05b1ff7fc20f6ea46fd35e6 100644 --- a/tests/run_ixrt.py +++ b/tests/run_ixrt.py @@ -189,7 +189,7 @@ def run_clf_testcase(model): script = f""" cd ../{model['model_path']} export ORIGIN_ONNX_NAME=./swin-large-torch-fp32 - export OPTIMIER_FILE=/root/data/3rd_party/iluvatar-corex-ixrt/tools/optimizer/optimizer.py + export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py export PROJ_PATH=./ bash 
scripts/infer_swinl_fp16_performance.sh cd ./ByteMLPerf/byte_infer_perf/general_perf @@ -450,7 +450,7 @@ def run_nlp_testcase(model): set -x cd ../{model['model_path']} export ORIGIN_ONNX_NAME=./data/open_{model_name}/{model_name} - export OPTIMIER_FILE=/root/data/3rd_party/iluvatar-corex-ixrt/tools/optimizer/optimizer.py + export OPTIMIER_FILE=./iluvatar-corex-ixrt/tools/optimizer/optimizer.py export PROJ_PATH=./ bash scripts/infer_{model_name}_{prec}_performance.sh cd ./ByteMLPerf/byte_infer_perf/general_perf diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/README.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/README.md deleted file mode 100755 index 3d1318032a7b03971285a05b997d3275c0d3c3cf..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/README.md +++ /dev/null @@ -1,114 +0,0 @@ -# IxRT optimizer - -## 1. optimizer 简介 - -`optimizer` 是一个 ixrt 中集成的图融合工具,用于将onnx图中的op融合成对应的IxRT plugin,一般与 IxRT 配合进行使用; - -## 2. optimizer 功能说明 - -| 功能 | 说明 | -| ---------- | ------------------------------------------------------------ | -| 动态图支持 | 支持融合动态图和静态图 | -| 模型支持 | 目前测试通过videobert, roberta, deberta, swinL, roformer, albert, yolov5s, visionTransformer, gpt2模型,其他模型暂不推荐使用该工具 | - -## 3. optimizer 运行参数 - -| 参数 | 说明 | -| ---------------- | ------------------------------------------------------------ | -| `--onnx` | 必选 ,指定要运行的 onnx 模型路径 | -| `--num_heads` | 可选 ,指定模型对应Attention模块注意力头的个数 | -| `--hidden_size` | 可选, 模型模型隐藏层的大小 | -| `--input_shapes` | 可选 ,固定动态模型的输入形状,以从静态形状推理,示例 --input_shapes "input_name1:3x224x224, input_name2:3x224x224"类型 | -| `--dump_onnx` | 可选 ,用于图融合过程中dump出中间的onnx图,生成 _sim 结尾的 onnx 模型 | -| `--model_type` | 可选 ,可以指定要融合的模型类型,默认是"bert", 可选["bert", "swint", "roformer", "yolo", "gpt2", "vit"] | -| `--log_level` | 可选 ,指定IxRT运行时显示日志的等级, 可指定为debug、info、error,默认为 info | - - -## 4. 
运行示例 - -### 4.1 示例1:融合albert|videobert|roberta|deberta - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} -``` - -### 4.2 示例2:融合swinL - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint -``` - -### 4.3 示例3:融合roformer - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --model_type roformer -``` - -### 4.4 示例4:融合yolov5s - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --model_type yolo -``` - -### 4.5 精度验证 - -#### 4.5.1 示例1:albert模型 - -模型变量示例: - -``` -MODEL_PATH="data/albert/albert-base-squad.onnx" -MODEL_END_PATH="data/albert/albert-base-squad_end.onnx" -MODEL_ENGINE_PATH="data/albert/albert-base-squad_end.engine" -``` - -运行命令 - -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --dump_onnx -ixrtexec --onnx ${MODEL_END_PATH} --min_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \ - --opt_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \ - --max_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \ - --save_engine ${MODEL_ENGINE_PATH} --log_level verbose --plugins ixrt_plugin -ixrtexec --load_engine ${MODEL_ENGINE_PATH} --ort_onnx ${MODEL_PATH} --plugins ixrt_plugin --verify_acc -``` - -#### 4.5.2 示例2:swinL模型 - -模型变量示例: - -``` -BS=1 -MODEL_PATH="data/swint/swin-transformer-large.onnx" -MODEL_END_PATH = "data/swint/swin-transformer-large_end.onnx" -MODEL_ENGINE_PATH = "data/swint/swin-transformer-large_end.engine" -MODEL_SIM_STATIC_SIM_PATH = "data/swint/swin-transformer-large_sim_static_sim.onnx" -``` - -运行命令 - -```bash -cd oss/tools/optimizer -# 固定输入形状为 ${BS}x3x384x384 -python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint --dump_onnx - -# Build engine -ixrtexec --onnx ${MODEL_END_PATH} --save_engine ${MODEL_ENGINE_PATH} --log_level verbose --plugins ixrt_plugin - -# 测试性能 -ixrtexec --load_engine ${MODEL_ENGINE_PATH} --plugins ixrt_plugin - -# 测试精度 -ixrtexec --load_engine ${MODEL_ENGINE_PATH} --ort_onnx ${MODEL_SIM_STATIC_SIM_PATH} --plugins ixrt_plugin --verify_acc -``` - -请参考[高级话题](5_advanced_topics.md)中的精度对比工具一节,了解详细使用方法和原理。 - -也可以用[C++ API 使用简介](3_cpp_api.md)或 [Python API 使用简介](4_python_api.md) - -具体使用方法可以参考oss/samples diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py deleted file mode 100644 index de522e5b082b122a28b0a0423a40909598aa82d5..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-# - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md deleted file mode 100644 index 65175643c0e50d8445ef65deae088de4600244f0..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md +++ /dev/null @@ -1,44 +0,0 @@ -## CI Test tool for IxRT - -### 1. Install dltest tool - - python setup.py develop - -### 2. Usage - -#### 2.1 Fetch log - -Commmand: - -```shell -ixdltest-fetch args_or_pipe ${log_path} -``` - -Arguments: - -- p or patterns, The pattern of fetch log; -- pn or pattern_names, The name of pattern; -- use_re, Whether use regular expression; -- d or nearest_distance, default=10, The nearest distance of matched pattern; -- start_flag, The flag of start to record log; -- end_flag, The flag of stop to record log; -- split_pattern, The pattern is used to match line, If the line is matched, argument `split_sep` to split the line. -- split_sep, The seperator is used to split line; -- split_idx, The index of split line; -- saved, Save result to path; -- log, Log path. - -Example -Analyse from file -``` -$ ixdltest-fetch run.log -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100 -{'results': [{'Throughput': [188.5461778786721]}]} -- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0 -``` - -Analyse from command line pipe -``` -$ cat run.log | ixdltest-fetch -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100 -{'results': [{'Throughput': [188.5461778786721]}]} -- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0 -``` diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py deleted file mode 100644 index 5458f31666f11de72d52a4e834b8a87be9a992d0..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .utils.infer_args import show_infer_arguments \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py deleted file mode 100644 index 182e895c7fe902a31fc982fab6f96e0c55125c4a..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os -from typing import List, Iterable, Optional - -from dltest.cli.log_parser_cli import LogParserCLI -from dltest.log_parser import LogParser -from dltest.model_compare_config import get_compare_config_with_full_path -from dltest.utils.misc import get_full_path -from dltest.utils.subprocess_tools import get_output -from dltest.model_compare_config import ComparatorConfig - - -FRAMEWORKS = list(ComparatorConfig.get_frameworks()) - -REMAINDER = '...' - -assertion_expr_factory = dict( - eq = "a == b", - ne = "a != b", - ge = "a >= b", - le = "a <= b", - gt = "a > b", - lt = "a < b", -) - - -class AssertCLI(LogParserCLI): - - def command_name(self): - return "assert" - - def predefine_args(self): - super(AssertCLI, self).predefine_args() - self.parser.add_argument('-b', '--assertion_second_value', type=float, default=None, - help='It is used in assertion expression.') - self.parser.add_argument('--print_result', action="store_true", default=False, - help='Whether print result') - self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], - help='The method of capture output') - # FIXME: Using store_action to replase it - self.parser.add_argument('--only_last', type=int, default=0, - help='Whether use the last result to compare') - self.parser.add_argument('--expr', type=str, default="ge", - help=f"Assertion expression, option keys: {', '.join(assertion_expr_factory.keys())}" + - ", or a executable code, such as `a > b`, `a > 1`, ...") - self.parser.add_argument('--use_predefined_parser_rules', action="store_true", default=False, - help='Whether use predefined args of parser.') - self.parser.add_argument('--log', type=str, default=None, help="Log path") - self.parser.add_argument("--run_script", default=[], nargs=REMAINDER) - - def parse_args(self, *args, **kwargs): - args = super(AssertCLI, self).parse_args() - args.only_last = args.only_last > 0 - if len(args.run_script) == 0 and args.log is None: - raise ValueError("The one of `--run_script` or `--log` must be given.") - - if args.assertion_second_value is None: - if args.expr is None: - raise ValueError("The one of `--assertion_second_value` or `--expr` must be given.") - - if args.expr in assertion_expr_factory: - raise ValueError( - "The comparison operators depend on the argument `assertion_second_value`." 
- ) - - return args - - def create_parser(self, args): - if args.use_predefined_parser_rules: - script_path = self._get_script_path(args.run_script) - config = get_compare_config_with_full_path(script_path, to_dict=False) - - return LogParser( - patterns=config.patterns, pattern_names=config.pattern_names, - use_re=config.use_re, nearest_distance=config.nearest_distance, - start_line_pattern_flag=config.start_line_pattern_flag, - end_line_pattern_flag=config.end_line_pattern_flag, - split_pattern=config.split_pattern, - split_sep=config.split_sep, - split_idx=config.split_idx - ) - - return LogParser( - patterns=args.patterns, pattern_names=args.pattern_names, - use_re=args.use_re, nearest_distance=args.nearest_distance, - start_line_pattern_flag=args.start_flag, - end_line_pattern_flag=args.end_flag, - split_pattern=args.split_pattern, - split_sep=args.split_sep, - split_idx=args.split_idx - ) - - def run(self): - args = self.parse_args() - parser = self.create_parser(args) - - if args.print_result: - print(args) - - output = self.get_log(args) - parsed_logs = self.parser_log(parser, output, args) - self.check_logs(parsed_logs, args) - - def get_log(self, args): - if len(args.run_script) == 0: - try: - with open(args.log) as f: - return f.readlines() - except: - print(f"ERROR: Read log fail in {args.log}") - exit(1) - else: - return get_output(args.run_script, capture_output_method=args.capture_output) - - def parser_log(self, parser, output, args) -> List[float]: - results = parser.parse(output) - if args.only_last: - results = results[-1:] - - if len(results) == 0: - raise ValueError("The parsed results is empty, please check patterns.") - if isinstance(results[0], dict): - if len(results[0]) == 0: - raise ValueError("The parsed results is empty, please check patterns.") - key = list(results[0].keys())[0] - results = [result[key] for result in results] - - if isinstance(results[0], Iterable): - results = [result[0] for result in results] - - return results - - def check_logs(self, parsed_logs, args): - if args.print_result: - print("Parsed result:", parsed_logs) - - assertion_expr = assertion_expr_factory.get(args.expr, args.expr) - - assert_results = [] - b = args.assertion_second_value - for a in parsed_logs: - assert_results.append(eval(assertion_expr)) - - if args.print_result: - print("The result of assertion expression:", assert_results) - - if any(assert_results): - print("SUCCESS") - exit(0) - print("FAIL") - exit(1) - - def _get_script_path(self, run_script: List[str]): - # Find shell script by current run_script - def _find_real_shell_script(cmd: List[str]): - for i, field in enumerate(cmd): - if field.endswith('.sh') and self._get_framework(field) in FRAMEWORKS: - return field - - real_shell_script = _find_real_shell_script(run_script) - - # Find shell script by parent process - if real_shell_script is None: - ppid = os.getppid() - import psutil - pproc = psutil.Process(ppid) - pproc_cmd = pproc.cmdline() - real_shell_script = _find_real_shell_script(pproc_cmd) - - if real_shell_script is not None: - real_shell_script = self._get_script_abs_path(real_shell_script) - return real_shell_script - - raise RuntimeError("The script is not named correctly, " + \ - "please use a script name ending with the framework, " + \ - f"got `{' '.join(run_script)}`, " + \ - "e.g. 
train_resnet50_torch.sh") - - def _get_framework(self, shell_script: str) -> Optional[str]: - try: - return shell_script.split('.')[-2].split('_')[-1] - except: - return None - - def _get_script_abs_path(self, run_script): - real_run_script = os.path.realpath(run_script) - if os.path.exists(real_run_script): - return real_run_script - - if "MODEL_DIR" in os.environ: - return os.path.join(os.environ["MODEL_DIR"], run_script) - - if "OLDPWD" in os.environ: - real_run_script = os.path.join(os.environ["OLDPWD"], run_script) - if os.path.exists(real_run_script): - return real_run_script - - raise FileNotFoundError("Not found running script path, " + \ - "please set environment variable `MODEL_DIR`, " + \ - "e.g /path/to/deeplearningsamples/executables/resnet.") - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py deleted file mode 100644 index b40f3a72fb949c18104963fb598c58076c65b479..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py +++ /dev/null @@ -1,56 +0,0 @@ -import os - -from .assert_cli import AssertCLI -from ..utils.subprocess_tools import execute_shell - -RUN_MODE_KEY = "RUN_MODE" -RUN_MODE_STRICT = "strict" - - -class CheckCli(AssertCLI): - - def __init__(self, *args, **kwargs): - super(CheckCli, self).__init__(*args, **kwargs) - self.args = None - - def command_name(self): - return "check" - - def predefine_args(self): - self.parser.add_argument("--check_mode", type=str, default="no", - choices=["all", "strict", "nonstrict", "no"], - help="which running mode needs to be checked") - self.parser.add_argument("--nonstrict_mode_args", type=str, default="", - help="the arguments are used with nonstric testing") - super(CheckCli, self).predefine_args() - - def parse_args(self, *args, **kwargs): - if self.args is None: - args = super(CheckCli, self).parse_args(*args, **kwargs) - args.use_predefined_parser_rules = True - args.nonstrict_mode_args = args.nonstrict_mode_args.split(" ") - - if not self.is_strict_testing(): - args.run_script.extend(args.nonstrict_mode_args) - - if args.check_mode == "all": - args.check_mode = self.current_running_mode() - - self.args = args - return self.args - - def run(self): - args = self.parse_args() - if args.check_mode == self.current_running_mode(): - return super(CheckCli, self).run() - else: - res = execute_shell(args.run_script) - exit(res.returncode) - - def current_running_mode(self): - return os.environ.get(RUN_MODE_KEY, RUN_MODE_STRICT) - - def is_strict_testing(self): - return self.current_running_mode() == RUN_MODE_STRICT - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py deleted file mode 100644 index c631f332b6a46c43c7891e4925d011e49741dc5d..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from dltest.cli.assert_cli import AssertCLI -from dltest.cli.log_comparator_cli import LogComparatorCLI -from dltest.cli.model_validator_cli import ModelValidatorCLI -from dltest.cli.fetch_log_cli import FetchLog -from dltest.cli.check_cli import CheckCli - - -#log_comparator_cli = LogComparatorCLI() -#model_validator_cli = ModelValidatorCLI() -fetch_log_cli = FetchLog() -#assert_cli = AssertCLI() -#check_cli = CheckCli() - - -def make_execute_path(): - preffix = "dltest.cli.entry_points" - clis = [] - for cli_var in globals(): - if cli_var.endswith('_cli'): - cmd_name = globals()[cli_var].command_name() - clis.append(f"ixdltest-{cmd_name}={preffix}:{cli_var}") - - return clis - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py deleted file mode 100644 index 41f3c3cac3151b61362b3ff57609df0f64896181..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import json -import sys -from typing import Mapping -from os.path import basename, join, exists, expanduser, dirname - -from dltest.log_parser import LogParser -from dltest.cli.log_parser_cli import LogParserCLI -from dltest.utils.iluvatar import get_iluvatar_card_type, IluvatarGPU - - - - -def parse_target(target): - result = {} - targets = target.split(",") - for i in targets: - item = i.split(":") - assert len(item) == 2 - key, value = item - result[key] = float(value) - return result - - -def load_json(file): - file_path = expanduser(file) - # 检查文件是否存在 - if exists(file_path): - # 加载json文件 - with open(file_path, 'r') as file: - data = json.load(file) - else: - # 创建一个空的json文件 - data = {} - - return data - -def process_results(results): - result = dict() - for i in results["results"]: - for k, v in i.items(): - result[k] = v[0] - return result - -class FetchLog(LogParserCLI): - - def command_name(self): - return "fetch" - - def predefine_args(self): - super(FetchLog, self).predefine_args() - self.parser.add_argument('log', nargs='?', type=str, help="Log path") - self.parser.add_argument('--saved', type=str, default=None, help='Save to path') - self.parser.add_argument('--saved_entry', type=str, default=None, help='Save to path') - self.parser.add_argument('-t_bi150','--target_bi150', type=str, default=-1.) - self.parser.add_argument('-t_mr100','--target_mr100', type=str, default=-1.) - self.parser.add_argument('-t_mr50','--target_mr50', type=str, default=-1.) - - def run(self): - args = self.parse_args() - parser = LogParser( - patterns=args.patterns, pattern_names=args.pattern_names, - use_re=args.use_re, nearest_distance=args.nearest_distance, - start_line_pattern_flag=args.start_flag, - end_line_pattern_flag=args.end_flag, - split_pattern=args.split_pattern, - split_sep=args.split_sep, - split_idx=args.split_idx - ) - - results = parser.parse(args.log) - if not isinstance(results, Mapping): - results = dict(results=results) - results = process_results(results) - print(results) - - if args.saved is not None: - saved = load_json(args.saved) - if not args.saved_entry: - raise Exception("You need to use --saved_entry to specify entry name of the result") - - saved[args.saved_entry] = results - with open(args.saved, 'w') as f: - json.dump(saved, f, indent=4) - self.compare_results(args, results) - - - def compare_results(self, args, results): - card = get_iluvatar_card_type() - if card == IluvatarGPU.UNKNOWN: - print("Not known which card is used, can you use ixsmi in the environment?") - return - user_target = getattr(args, 'target_'+card.name.lower(), "") - user_target = parse_target(user_target) - - is_expected = True - for key, target in user_target.items(): - if key not in results: - continue - if results[key]={target}") - if not is_expected: - sys.exit(1) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py deleted file mode 100644 index cac8a0a684440371ece5067086cd75eed939f482..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import json -from pprint import pprint - -from dltest.cli.log_parser_cli import LogParserCLI -from dltest.log_comparator import compare_logs_with_paths, DEFAULT_NEAREST_MATCH_CHARS - - -class LogComparatorCLI(LogParserCLI): - - def command_name(self): - return "compare" - - def predefine_args(self): - super(LogComparatorCLI, self).predefine_args() - self.parser.add_argument('--log1', type=str, help="First log") - self.parser.add_argument('--log2', type=str, help="Second log") - self.parser.add_argument('--threshold', type=float, default=0.0001, help="Threshold") - self.parser.add_argument('--only_last', type=int, default=1, help='Whether use the last result to compare') - self.parser.add_argument('--saved', type=str, default=None, help='Save to path') - self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result') - self.parser.add_argument('--allow_greater_than', action="store_true", default=False, help='Allow log1 greater than log2') - - def parse_args(self, *args, **kwargs): - args = super(LogComparatorCLI, self).parse_args(*args, **kwargs) - args.only_last = args.only_last >= 1 - - return args - - def run(self): - args = self.parse_args() - satisfied, results = compare_logs_with_paths( - log1=args.log1, log2=args.log2, - threshold=args.threshold, - patterns=args.patterns, pattern_names=args.pattern_names, - use_re=args.use_re, nearest_distance=args.nearest_distance, - start_line_pattern_flag=args.start_flag, - end_line_pattern_flag=args.end_flag, - only_last=args.only_last, - split_pattern=args.split_pattern, - split_sep=args.split_sep, - split_idx=args.split_idx, - allow_greater_than=True - ) - - if args.print_result: - pprint(results) - - if satisfied: - print("SUCCESS") - else: - print("FAIL") - - if args.saved is not None: - with open(args.saved, 'w') as f: - json.dump(results, f) - - - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py deleted file mode 100644 index d2e2dd1be2d305a83a2969b5d4dbfbfeef2d9fd0..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. - -import json -from typing import Mapping - -from dltest.log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS -from dltest.utils.base_cli import BaseCLI - - -class LogParserCLI(BaseCLI): - - def predefine_args(self): - self.parser.add_argument('-p', '--patterns', nargs="*", type=str, default=None, help='Fetched patterns') - self.parser.add_argument('-pn', '--pattern_names', nargs="*", type=str, default=None, help='The name of pattern') - self.parser.add_argument('--use_re', action="store_true", default=False, help='Whether use regular expression') - self.parser.add_argument('-d', '--nearest_distance', type=int, default=DEFAULT_NEAREST_MATCH_CHARS, help='The nearest distance of matched pattern') - self.parser.add_argument('--start_flag', type=str, default=None, help='The flag of start to record log') - self.parser.add_argument('--end_flag', type=str, default=None, help='The flag of stop to record log') - self.parser.add_argument('--split_pattern', type=str, default=None, help='The pattern is used to match line') - self.parser.add_argument('--split_sep', nargs="*", type=str, default=None, help='The seperator is used to split line') - self.parser.add_argument('--split_idx', nargs="*", type=int, default=None, help='The index of split line') - - def parse_args(self, *args, **kwargs): - args = super(LogParserCLI, self).parse_args(*args, **kwargs) - - return args - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py deleted file mode 100644 index 8d0d77d97d8f4f0d4d3528418c886884fa262575..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import json -import os -import os.path as ospath -from pprint import pprint -from typing import List, Union - -from dltest.utils.base_cli import BaseCLI -from dltest.utils.get_env import get_gpu_type -from dltest.utils.misc import get_full_path -from dltest.model_compare_config import get_compare_config_with_full_path -from dltest.log_comparator import compare_logs_with_paths -from dltest.utils.subprocess_tools import get_output - - -REMAINDER = '...' 
- - -class ModelValidatorCLI(BaseCLI): - - def command_name(self): - return "validate" - - def predefine_args(self): - super(ModelValidatorCLI, self).predefine_args() - self.parser.add_argument('-l', '--compare_log', type=str, default=None, help="Compare log") - self.parser.add_argument('--saved', type=str, default=None, help='Save to path') - self.parser.add_argument('--with_exit_code', type=int, default=1, help="Add exit code for the result of compared") - self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result') - self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], help='The method of capture output') - self.parser.add_argument("run_script", nargs=REMAINDER) - - def parse_args(self, *args, **kwargs): - args = super(ModelValidatorCLI, self).parse_args() - if len(args.run_script) == 0: - print("ERROR: Invalid run_script") - exit(1) - - return args - - def run(self): - args = self.parse_args() - output = self._run_script(args.run_script, capture_output_method=args.capture_output) - self.compare_logs( - output, args.compare_log, args.run_script, - args.saved, args.with_exit_code, - args.print_result - ) - - def compare_logs(self, output: List, compare_log: str, - run_script: List[str], saved: str=None, - with_exit_code: int=1, print_result=False): - script_path = self._get_script_path(run_script) - script_path = get_full_path(script_path) - compare_args = get_compare_config_with_full_path(script_path) - - if compare_log is None: - epoch = self._get_epoch(run_script) - script_name = ospath.basename(script_path) - dist_tag = self._get_dist_tag(script_name) - compare_log = self._find_comparable_log(script_path, epoch, dist_tag) - - if not ospath.exists(compare_log): - print(f"ERROR: {compare_log} not exist. 
Or please use argument `l` to locate log.") - exit(1) - - compare_args['log1'] = output - compare_args['log2'] = compare_log - - satisfied, results = compare_logs_with_paths(**compare_args) - - if print_result: - pprint(results) - - if satisfied: - print("SUCCESS") - else: - print("FAIL") - - if saved is not None: - with open(saved, 'w') as f: - json.dump(results, f) - - if with_exit_code: - if satisfied: - exit(0) - else: - exit(1) - - def _run_script(self, command: List, capture_output_method: str='tempfile'): - return get_output(command, capture_output_method=capture_output_method) - - def _get_script_path(self, run_script: List[str]): - for i, field in enumerate(run_script): - if field.endswith('.py') or field.endswith('.sh'): - return field - - raise RuntimeError("Not found the name of script, " + - "only support python or `sh` script, but got {}.".format(run_script)) - - def _find_comparable_log(self, script_path: str, epoch: Union[str, int], dist_tag: str): - gpu_type = get_gpu_type().lower() - - # Get the platform of trained log - if gpu_type == "nv": - gpu_type = 'bi' - else: - gpu_type = 'nv' - - script_path = get_full_path(script_path) - project_dir = self._get_project_dir(script_path) - script_name = ospath.basename(script_path) - - log_path = f"{project_dir}/runing_logs/{gpu_type}/{gpu_type}-{script_name}.epoch_{epoch}{dist_tag}.log" - - return log_path - - - def _get_epoch(self, run_script: List[str]): - for i, field in enumerate(run_script): - if "--epoch" in field: - if "=" in field: - return field.split("=")[1] - else: - return run_script[i + 1] - - return 'default' - - def _get_dist_tag(self, script_name: str): - try: - import torch - num_gpus = torch.cuda.device_count() - except: - num_gpus = os.environ.get("CUDA_VISIBLE_DEVICES", "all") - - if '_dist_' in script_name or '_multigpu_' in script_name: - return f".{num_gpus}card" - return "" - - def _get_project_dir(self, abs_path): - abs_path = ospath.abspath(abs_path) - script_dir = ospath.dirname(abs_path) - executables_dir = ospath.dirname(script_dir) - return ospath.dirname(executables_dir) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py deleted file mode 100644 index 9da2c0cd579a3407b6d743bfd2a4cdbbd28a687c..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -from typing import List, Mapping, Union, Tuple -from .log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS - -LogLines = List[Mapping] -CompareResult = Tuple[bool, Union[List, Mapping]] - - -def _compute_errors(value1: Mapping, value2: Mapping, threshold: Mapping, allow_greater_than=False) -> CompareResult: - if not isinstance(threshold, Mapping): - _thds = dict() - for key in value1.keys(): - _thds[key] = threshold - threshold = _thds - - result = dict() - satisfied = True - for key, _thd in threshold.items(): - v1, v2 = value1[key], value2[key] - origin_value_type = list - if not isinstance(v1, (tuple, list)): - origin_value_type = float - v1 = [v1] - v2 = [v2] - - real_errors = [] - for v1_i, v2_i in zip(v1, v2): - real_error = v1_i - v2_i - real_errors.append(real_error) - if satisfied and abs(real_error) > _thd: - if allow_greater_than and real_error > 0: - continue - satisfied = False - - if origin_value_type is float and len(real_errors) > 0: - real_errors = real_errors[0] - - result[key] = real_errors - - return satisfied, result - - -def compare_logs(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult: - total_lines = len(log1[0]) - real_errors = [] - satisfied = True - for line_idx in range(total_lines): - _satisfied, _error = _compute_errors(log1[line_idx], log2[line_idx], threshold, allow_greater_than=allow_greater_than) - real_errors.append(_error) - if satisfied and not _satisfied: - satisfied = False - - return satisfied, real_errors - - -def compare_logs_by_last_result(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult: - if len(log1) == 0 or len(log2) == 0: - return False, [] - return _compute_errors(log1[-1], log2[-1], threshold, allow_greater_than=allow_greater_than) - - -def compare_logs_with_paths(log1, log2, threshold: Union[float, Mapping], - patterns: List[str], - pattern_names: List[str] = None, - use_re: bool = False, - nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS, - start_line_pattern_flag: str = None, - end_line_pattern_flag: str = None, - only_last: bool=True, - split_pattern: Union[str, List] = None, - split_sep: List = None, - split_idx: List = None, - allow_greater_than: bool = False): - parser = LogParser( - patterns=patterns, pattern_names=pattern_names, - use_re=use_re, nearest_distance=nearest_distance, - start_line_pattern_flag=start_line_pattern_flag, - end_line_pattern_flag=end_line_pattern_flag, - split_pattern=split_pattern, - split_sep=split_sep, - split_idx=split_idx - ) - - log1 = parser.parse(log1) - log2 = parser.parse(log2) - - if only_last: - compare_result = compare_logs_by_last_result(log1, log2, threshold, allow_greater_than=allow_greater_than) - else: - compare_result = compare_logs(log1, log2, threshold, allow_greater_than=allow_greater_than) - - return compare_result[0], dict(log1=log1, log2=log2, errors=compare_result[-1]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py deleted file mode 100644 index 3c690d8f677b3ae470322e29c266e84993a74266..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from typing import List, Optional, Union, Mapping -import re -import sys - - -DEFAULT_NEAREST_MATCH_CHARS = 10 - - -def read_file(file): - with open(file, 'r') as f: - return f.readlines() - -def read_pipe(): - result = [] - for line in sys.stdin: - result.append(line) - return result - -def postprocess_search_result(results: List[str]) -> List[float]: - if len(results) != 0: - results = list(map(float, results)) - return results - - -def extract_nearest_value_by_key_inline(content: str, key: str, - nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]: - pattern = "%s[\s\S]{0,%d}?(\d+(?:\.\d+)?)" % (key, nearest_distance) - return extract_value_by_pattern_inline(content, pattern) - - -def extract_value_by_pattern_inline(content: str, pattern: str) -> List[float]: - results = re.findall(pattern, content) - return postprocess_search_result(results) - - -def extract_value(content: str, pattern: str, - inline=True, use_re=False, - nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]: - if inline: - if use_re: - return extract_value_by_pattern_inline(content, pattern) - else: - return extract_nearest_value_by_key_inline(content, pattern, nearest_distance) - else: - raise NotImplementedError() - - -class LogParser: - - def __init__(self, - patterns: List[str]=None, - pattern_names: List[str]=None, - use_re: bool=False, - nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS, - start_line_pattern_flag: str=None, - end_line_pattern_flag: str=None, - split_pattern: Union[str, List]=None, - split_sep: List[str]=None, - split_idx: List[int]=None): - if patterns is None and split_sep is None: - raise ValueError("The one of argument `patterns` or `split_sep` must be given.") - - if pattern_names is not None: - if isinstance(patterns, (tuple, list)) and patterns is not None and len(patterns) != len(pattern_names): - raise ValueError("The length of `pattern_names` argument not equal to `patterns`.") - if isinstance(split_sep, (tuple, list)) and split_sep is not None and len(split_sep) != len(pattern_names): - raise ValueError("The length of `pattern_names` argument not equal to `split_sep`.") - - if split_sep is not None and (split_idx is None or not isinstance(split_idx, (int, tuple, list))): - raise ValueError("Invalid index to split text, got {}.".format(split_idx)) - - if split_sep is not None and split_pattern is None: - raise ValueError("Invalid pattern to split text, got {}.".format(split_pattern)) - - self.patterns = patterns - self.use_re = use_re - self.nearest_distance = nearest_distance - self.start_line_pattern_flag = start_line_pattern_flag - self.end_line_pattern_flag = end_line_pattern_flag - - if not isinstance(split_sep, (tuple, list)) and split_sep is not None: - split_sep = [split_sep] - - if not isinstance(split_idx, (tuple, list)): - split_idx = [split_idx] - - self.split_sep = split_sep - self.split_idx = split_idx - - if pattern_names is None: - if patterns is None: - pattern_names = split_idx - else: - 
pattern_names = patterns - self.pattern_names = pattern_names - - if not isinstance(split_pattern, (tuple, list)) and split_sep is not None: - split_pattern = [split_pattern] * len(split_sep) - self.split_pattern = split_pattern - - self.start_record = start_line_pattern_flag is None - - def parse(self, path_or_logs: Union[str, List]) -> List[dict]: - """ - : return: [{matric_name: value}, ...] - """ - - - if path_or_logs: - path_or_logs = read_file(path_or_logs) - else: - path_or_logs = read_pipe() - - ret = [] - for line in path_or_logs: - result = self.parse_inline(line) - if len(result) == 0: - continue - ret.append(result) - return ret - - def parse_inline(self, line) -> dict: - if not self.can_record(line): - return {} - - if self.split_sep is None: - return self._parse_inline_by_match(line) - return self._parse_inline_by_split(line) - - def _parse_inline_by_match(self, line: str): - ret = {} - for name, pattern in zip(self.pattern_names, self.patterns): - result = extract_value( - line, pattern, inline=True, use_re=self.use_re, - nearest_distance=self.nearest_distance - ) - if len(result) == 0: - continue - ret[name] = result - return ret - - def _parse_inline_by_split(self, line: str, to_type=float): - ret = {} - for name, sep, idx, pattern in zip(self.pattern_names, - self.split_sep, - self.split_idx, - self.split_pattern): - if not self.can_matched(line, pattern): - continue - if '\t' in sep: - segs = line.strip().split(sep) - else: - segs = line.strip().replace('\t', ' ').split(sep) - segs = list(filter(lambda kv: kv.strip() not in ["", " ", None], segs)) - if len(segs) <= idx: - continue - ret[name] = to_type(segs[idx]) - return ret - - def can_record(self, line: str): - if self.start_line_pattern_flag is None: - self.start_record = True - elif not self.start_record: - self.start_record = self.can_matched(line, self.start_line_pattern_flag) - - if self.start_record: - if self.end_line_pattern_flag is not None and self.can_matched(line, self.end_line_pattern_flag): - self.start_record = False - - return self.start_record - - def can_matched(self, content: str, pattern: str): - result = re.findall(pattern, content) - return len(result) != 0 - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py deleted file mode 100644 index ab7c60d3a6f0758bdac30b12fe82c83dab6cd520..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os.path as ospath - -from typing import NamedTuple, Union, List, Mapping - -from dltest.log_parser import DEFAULT_NEAREST_MATCH_CHARS - - -class LogComparatorArgs(NamedTuple): - threshold: Union[float, Mapping] - patterns: List[str] = None - pattern_names: List[str] = None - use_re: bool = False - nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS - start_line_pattern_flag: str = None - end_line_pattern_flag: str = None - split_pattern: Union[str, List] = None - split_sep: List = None - split_idx: List = None - only_last: bool = True - allow_greater_than: bool = True - - def to_dict(self): - return self._asdict() - - -class ArgsModelsTuple(NamedTuple): - - args: LogComparatorArgs - models: List[str] - - -class BaseConfig: - - def __getitem__(self, item): - return self.__class__.__dict__[item] - - def __getattr__(self, item): - return self.__class__.__dict__[item] - - def __iter__(self): - for attr, value in self.__class__.__dict__.items(): - if isinstance(value, ArgsModelsTuple): - yield attr - - def iter_items(self): - for attr, value in self.__class__.__dict__.items(): - if isinstance(value, ArgsModelsTuple): - yield attr, value - - -class _TFComparatorConfig(BaseConfig): - - cnn_benchmarks = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["Accuracy @ 1 =", "Accuracy @ 5 ="], - pattern_names=["Acc@1", "Acc@5"] - ), - models=["alexnet", "inceptionv3", "resnet50", "resnet101", "vgg16"] - ) - - dist_cnn_becnmarks = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - split_sep=[' ', ' '], - split_idx=[9, 10], - split_pattern="[\s\S]*?images/sec:[\s\S]*?jitter", - pattern_names=['Acc@1', 'Acc@5'] - ), - models=[ - "alexnet_dist", "inceptionv3_dist", "resnet50_dist", "resnet101_dist", "vgg16_dist" - ] - ) - - bert = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["eval_accuracy ="], - pattern_names=["Accuracy"] - ), - models=["bert"] - ) - - ssd = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["acc="], - pattern_names=["Acc@1"] - ), - models=["ssd"] - ) - - yolov3 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.8, - patterns=["mAP"] - ), - models=["yolov3"] - ) - - vnet = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["background_dice", "anterior_dice", "posterior_dice"] - ), - models=["vnet"] - ) - - -class _TorchComparatorConfig(BaseConfig): - classification = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=8.0, patterns=['Acc@1', 'Acc@5'], - start_line_pattern_flag="Start training", - ), - models=[ - 'googlenet', 'inceptionv3', 'mobilenetv3', 'resnet', 'shufflenetv2', - 'vgg', 'resnet50_dali', 'resnext', 'densenet' - ] - ) - - detection = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.03, - patterns=[ - "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] =" - ], - pattern_names=["mAP"], - start_line_pattern_flag="IoU metric: bbox", - end_line_pattern_flag="IoU metric: segm" - ), - models=[ - 'maskrcnn', 'retinanet', 'ssd' - ] - ) - - bert_cola = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=['mcc'] - ), - models=['bert_cola'] - ) - - bert_mrpc = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=['acc'] - ), - models=['bert_mrpc'] - ) - - bert_pretrain_apex = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=['eval_mlm_accaracy'] - ), - models=['bert_pretrain_apex'] - ) - - segmentation = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=8.0, - 
patterns=['mean IoU:'], - pattern_names=['mIoU'] - ), - models=[ - 'deeplabv3', 'fcn' - ] - ) - - t5 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=5.0, - split_pattern="eval_bleu[\s\S]*?=", - split_sep=["="], - split_idx=[1], - pattern_names=['EvalBleu'] - ), - models=['t5'] - ) - - yolov3 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=["mAP"] - ), - models=['yolov3'] - ) - - yolov5 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - patterns=[ - "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] =" - ], - pattern_names=["mAP"], - ), - models=['yolov5'], - ) - - yolov5s_coco128 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", - split_sep=[" ", " "], - split_idx=[5, 6], - pattern_names=["AP50", "mAP"] - ), - models=['yolov5s_coco128'] - ) - - centernet_resnet18 = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", - split_sep=[" ", " "], - split_idx=[5, 6], - pattern_names=["AP50", "mAP"] - ), - models=['centernet_resnet18'] - ) - - fcos_resnet50_fpn = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.08, - split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", - split_sep=[" ", " "], - split_idx=[5, 6], - pattern_names=["AP50", "mAP"] - ), - models=['fcos_resnet50_fpn'] - ) - - ocr_recognition = ArgsModelsTuple( - args=LogComparatorArgs( - threshold=0.5, patterns=["0_word_acc"], - ), - models=[ - "sar", "satrn" - ] - ) - - - -class ComparatorConfig: - - _configs = dict(tf=_TFComparatorConfig(), torch=_TorchComparatorConfig()) - - @classmethod - def get_frameworks(cls) -> List: - return list(cls._configs.keys()) - - @classmethod - def get(cls, tf_or_torch, name, default=None): - for model_kind, comb in cls._configs[tf_or_torch].iter_items(): - if name in comb.models: - return comb.args - if default is not None: - return default - raise KeyError("Not found config, but got {name} for {fw}".format(name=name, fw=tf_or_torch)) - - @classmethod - def find_config(cls, script_path: str) -> LogComparatorArgs: - tf_or_torch = script_path.split('.')[-2].split('_')[-1] - - # Find by the name of script - script_name = ospath.basename(script_path).rsplit('.', maxsplit=1)[0] - if script_name.startswith('train_'): - script_name = script_name.replace("train_", "", 1) - while script_name not in [None, "", "/", "\\"]: - try: - config = cls.get(tf_or_torch, script_name) - return config - except: - pass - script_name = script_name.rsplit('_', maxsplit=1) - if len(script_name) <= 1: - break - script_name = script_name[0] - - # Find by the name of model's dir - model_dir_name = ospath.basename(ospath.dirname(script_path)) - try: - config = cls.get(tf_or_torch, model_dir_name) - return config - except: - raise RuntimeError("Not found for", script_path) - - -def get_compare_config_with_full_path(script_path: str, to_dict=True): - config = ComparatorConfig.find_config(script_path) - if to_dict: - return config.to_dict() - return config - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py deleted file mode 100644 index 35f7efa99b21179da30ce34f412fa3319ea1ba00..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from argparse import ArgumentParser -from abc import abstractmethod - - -class BaseCLI: - - def __init__(self, parser=None, *args, **kwargs): - if parser is None: - self.parser = ArgumentParser(description=self.description ,*args, **kwargs) - - def __call__(self): - self.run() - - @property - def description(self): - return None - - @abstractmethod - def command_name(self): - pass - - def predefine_args(self): - pass - - def parse_args(self, *args, **kwargs): - self.predefine_args() - return self.parser.parse_args(*args, **kwargs) - - @abstractmethod - def run(self): - pass - - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py deleted file mode 100644 index 97407f37bd9d8a4c5e0a68c760a561ec03a29f95..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-
-import os
-from collections import defaultdict
-import os.path as osp
-import subprocess
-import sys
-
-
-def get_envinfo():
-    import torch
-    env_info = {}
-    env_info['sys.platform'] = sys.platform
-    env_info['Python'] = sys.version.replace('\n', '')
-
-    cuda_available = torch.cuda.is_available()
-    env_info['CUDA available'] = cuda_available
-    if cuda_available:
-        from torch.utils.cpp_extension import CUDA_HOME
-        env_info['CUDA_HOME'] = CUDA_HOME
-        if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
-            try:
-                nvcc = osp.join(CUDA_HOME, 'bin/nvcc')
-                nvcc = subprocess.check_output(
-                    f'"{nvcc}" -V | tail -n1', shell=True)
-                nvcc = nvcc.decode('utf-8').strip()
-            except subprocess.SubprocessError:
-                nvcc = 'Not Available'
-            env_info['NVCC'] = nvcc
-
-        devices = defaultdict(list)
-        for k in range(torch.cuda.device_count()):
-            devices[torch.cuda.get_device_name(k)].append(str(k))
-        for name, devids in devices.items():
-            env_info['GPU ' + ','.join(devids)] = name
-
-    gcc = subprocess.check_output('gcc --version | head -n1', shell=True)
-    gcc = gcc.decode('utf-8').strip()
-    env_info['GCC'] = gcc
-
-    env_info['PyTorch'] = torch.__version__
-
-    return env_info
-
-
-def get_gpu_type():
-    import torch
-    if "DEBUG_GPU_TYPE" in os.environ:
-        return os.environ["DEBUG_GPU_TYPE"]
-
-    if not torch.cuda.is_available():
-        return "BI"
-    dev_name = torch.cuda.get_device_name(0)
-    if 'IX BI' in dev_name or getattr(torch, "corex", False):
-        _type = "BI"
-    else:
-        _type = "NV"
-
-    return _type
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py
deleted file mode 100644
index 7328dd737c2720d544027ad1822d3c2007656a8e..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import sys
-import subprocess
-from enum import Enum
-
-__all__ = ["get_iluvatar_card_type", "IluvatarGPU"]
-
-class IluvatarGPU(Enum):
-    UNKNOWN = -1
-    MR50 = 0
-    MR100 = 1
-    BI150 = 2
-
-card_ixsmi_names = {
-    "BI150": IluvatarGPU.BI150,
-    "BI-V150": IluvatarGPU.BI150,
-    "MR100": IluvatarGPU.MR100,
-    "MR-V100": IluvatarGPU.MR100,
-    "MR50": IluvatarGPU.MR50,
-    "MR-V50": IluvatarGPU.MR50,
-}
-
-def get_iluvatar_card_type():
-    command = 'ixsmi -L | grep "GPU \{1,\}0"'
-    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    if result.returncode == 0:
-        for key, value in card_ixsmi_names.items():
-            if key in result.stdout:
-                return value
-        else:
-            return IluvatarGPU.UNKNOWN
-    else:
-        return IluvatarGPU.UNKNOWN
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py
deleted file mode 100644
index 29760001cab2d9a8cbeecc894e9e3344ad00d2b4..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import os
-
-from typing import Union, List, Dict, Any, Mapping
-from argparse import Namespace, ArgumentParser
-import json
-
-
-def _obj_to_dict(obj) -> Dict:
-    if isinstance(obj, Mapping):
-        return obj
-
-    try:
-        from absl import flags
-        if isinstance(obj, flags.FlagValues):
-            return obj.flag_values_dict()
-    except:
-        pass
-    if isinstance(obj, Namespace):
-        return obj.__dict__
-    elif isinstance(obj, List):
-        new_obj = dict()
-        for _o in obj:
-            _o_dict = _obj_to_dict(_o)
-            new_obj.update(_o_dict)
-        return new_obj
-    elif not isinstance(obj, Dict):
-        if hasattr(obj, "__dict__"):
-            return obj.__dict__
-        try:
-            typename = type(obj).__name__
-        except:
-            typename = str(obj)
-        return {typename: str(obj)}
-
-
-def json_dump_obj(o):
-    if hasattr(o, "__name__"):
-        return o.__name__
-    return str(o)
-
-
-def show_infer_arguments(args: Union[List, Dict, Any]):
-    """ print running arguments
-    Example 1: For ArgumentParser
-    >>> parser = ArgumentParser("Test")
-    >>> parser.add_argument("--arg0", type=str)
-    >>> args = parser.parse_args()
-    >>> show_infer_arguments(args)
-
-    Example 2: For dict
-    >>> args = dict(arg=1)
-    >>> show_infer_arguments(args)
-
-    Example 3: For custom object
-    >>> from collections import namedtuple
-    >>> ArgsType = namedtuple("ArgsType", ["arg"])
-    >>> args = ArgsType(arg=123)
-    >>> show_infer_arguments(args)
-
-    Example 4: For absl
-    >>> from absl import flags
-    >>> flags.DEFINE_string("arg", "123", "test")
-    >>> show_infer_arguments(flags.FLAGS)
-
-    Example 5: For multi args
-    >>> args1 = dict(a=1)
-    >>> args2 = dict(b=2)
-    >>> show_infer_arguments([args1, args2])
-
-    """
-    if not "SHOW_RUNNING_ARGS" in os.environ:
-        return
-
-    if os.environ["SHOW_RUNNING_ARGS"].lower() in ["0", "f", "false"]:
-        return
-
-    if "LOCAL_RANK" in os.environ:
-        if os.environ["LOCAL_RANK"] != "0":
-            return
-    args = _obj_to_dict(args)
-    args = json.dumps(args, default=json_dump_obj)
-    print("[RunningArguments]", args)
-
-
-if __name__ == '__main__':
-    os.environ["SHOW_RUNNING_ARGS"] = "1"
-    show_infer_arguments([dict(a=1), dict(b=1), object()])
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py
deleted file mode 100644
index 457bdb3ee2aab7d98faa5567856e8fa923589e0a..0000000000000000000000000000000000000000
--- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
- -import copy -import os - - -def get_full_path(fname): - pwd = os.getcwd() - if fname.startswith('/'): - return fname - return os.path.join(pwd, fname) - - -def is_main_proc(rank): - return str(rank) in ["None", "-1", "0"] - - -def main_proc_print(*args, **kwargs): - if "RANK" in os.environ: - if is_main_proc(os.environ["RANK"]): - print(*args, **kwargs) - return - - if "LOCAL_RANK" in os.environ: - if is_main_proc(os.environ["LOCAL_RANK"]): - print(*args, **kwargs) - return - - print(*args, **kwargs) - - -def create_subproc_env(): - env = copy.copy(os.environ) - env["USE_DLTEST"] = "1" - return env \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py deleted file mode 100644 index a9883213f4f44d8253986e91c64f4015c66d6ec4..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os -import os.path as ospath -from pathlib import Path -import tempfile - - -class TemporaryFile: - - def __init__(self, with_open=False, mode='r'): - self.name = None - self.with_open = with_open - self.mode = mode - - self.file = None - - def create(self): - self.name = tempfile.mktemp() - file_path = Path(self.name) - file_path.touch() - - def delete(self): - if self.name is not None and ospath.exists(self.name): - os.unlink(self.name) - - def read(self): - self._check_file_status() - return self.file.read() - - def readlines(self): - self._check_file_status() - return self.file.readlines() - - def _check_file_status(self): - if self.file is None: - raise RuntimeError("File is closed, please reopen it.") - - def __enter__(self): - self.create() - if self.with_open: - self.file = open(self.name, mode=self.mode) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.with_open: - self.file.close() - self.delete() - - - - - - - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py deleted file mode 100644 index 8c5de879b0470d29e208368f1681df8469dcf488..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import subprocess -from typing import Callable, Union, List - -from dltest.utils.real_tempfile import TemporaryFile -from dltest.utils import misc - - -def get_output_with_pipe(command, shell=None, callback: Callable[[list], None]=None, *args, **kwargs): - if shell is None: - shell = True - - if shell and not isinstance(command, str): - command = " ".join(command) - - stream = subprocess.Popen( - command, shell=shell, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - *args, **kwargs - ) - outputs = [] - while 1: - exit_code = stream.poll() - if exit_code is None: - if stream.stdout.readable(): - outputs.append(stream.stdout.readline().decode("utf8").rstrip()) - if callback is not None: - callback(outputs[-1:]) - print(outputs[-1]) - else: - if stream.stdout.readable(): - lines = stream.stdout.readlines() - lines = [line.decode("utf8".rstrip()) for line in lines] - outputs.extend(lines) - if callback is not None: - callback(outputs[-1:]) - print('\n'.join(lines)) - break - - return outputs - - -def get_output_with_tempfile(command, *args, **kwargs): - if not isinstance(command, (list, tuple)): - command = [command] - stdout = None - with TemporaryFile(with_open=True) as file: - command.extend(['|', 'tee', file.name]) - command = " ".join(command) - - res = subprocess.run(command, stdout=stdout, stderr=subprocess.STDOUT, shell=True, *args, **kwargs) - output = file.readlines() - - return output - -def execute_shell(command, *args, **kwargs): - if "env" not in kwargs: - kwargs["env"] = misc.create_subproc_env() - - if not isinstance(command, (list, tuple)): - command = [command] - - command = " ".join(command) - res = subprocess.run(command, - shell=True, *args, **kwargs) - return res - -def get_output(command: List, capture_output_method: str = 'tempfile', *args, **kwargs): - if "env" not in kwargs: - kwargs["env"] = misc.create_subproc_env() - - if capture_output_method == "tempfile": - return get_output_with_tempfile(command, *args, **kwargs) - return get_output_with_pipe(command, *args, **kwargs) \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py deleted file mode 100644 index 2e4fa4eea09fa2cdf51b02619d56fe5fcced869f..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. - -from setuptools import setup, find_packages -from dltest.cli.entry_points import make_execute_path - -setup( - name="dltest", - version="0.1", - description='Iluvatar Corex AI Toolbox', - packages=find_packages(exclude=('examples')), - include_package_data=True, - zip_safe=False, - entry_points = { - 'console_scripts': make_execute_path(), - }, - install_requires=[ - 'psutil' - ] -) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_PVT.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_PVT.py deleted file mode 100644 index 3a9c0ca081a1b44c00b0909c2b69c0e5a00c1e6a..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_PVT.py +++ /dev/null @@ -1,593 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Optional - -import onnx -from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper -from passes.fuse_series_bias_add import FusionSerialBiasAdd -from passes.fusion_albert_attention import FusionAlbertAttention -from passes.fusion_attention import AttentionMask, FusionAttention -from passes.fusion_biasgelu import FusionBiasGelu -from passes.fusion_customfc import ( - FusionCustomFC, - FusionCustomFCActivation, - FusionCustomFCGPT2, -) -from passes.fusion_disentangled_attention import FusionDisentangledAttention -from passes.fusion_embedlayer import FusionEmbedLayerNormalization -from passes.fusion_fastgelu import FusionFastGelu -from passes.fusion_format_roformer import ( - FusionFormatInvalidMask, - FusionRemoveUselessElementwise, -) -from passes.fusion_gelu import FusionGelu -from passes.fusion_gelu_approximation import FusionGeluApproximation -from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast -from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF -from passes.fusion_options import FusionOptions -from passes.fusion_qordered_attention import FusionQOrderedAttention -from passes.fusion_qordered_gelu import FusionQOrderedGelu -from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization -from passes.fusion_qordered_matmul import FusionQOrderedMatMul -from passes.fusion_reshape import FusionReshape -from passes.fusion_shape import FusionShape -from passes.fusion_skiplayernorm import ( - FusionBiasSkipLayerNormalization, - FusionSkipLayerNormalization, -) - -from passes.fusion_utils import FusionUtils - -from passes.fusion_conv_reformat import FusionConvReformat - -from passes.fusion_xsoftmax import FusionXSoftmax -from passes.fusion_PVT_attention import FusionPVTAttention -from passes.onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class PVTOptimizationOptions(FusionOptions): - """This class is deprecated""" - - def __init__(self, model_type): - logger.warning( - f"PVTOptimizationOptions is depreciated. Please use FusionOptions instead." - ) - super().__init__(model_type) - - -class PVTOnnxModel(OnnxModel): - def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): - """Initialize BERT ONNX Model. 
- - Args: - model (ModelProto): the ONNX model - num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). - hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). - """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - self.attention_fusion.apply() - FusionAlbertAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ).apply() - # FusionVideoBertAttention(self).apply() - # FusionVITAttention(self).apply() - # FusionSwinLAttention(self).apply() - # FusionGptAttentionNoPast(self).apply() - FusionPVTAttention(self).apply() - # Only relevant in models with Q-DQ nodes - self.qordered_attention_fusion.apply() - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_gpt2_classify(self): - fusion = FusionCustomFCGPT2(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def fuse_custom_xsoftmax(self): - fusion = FusionXSoftmax(self) - fusion.apply() - - def fuse_disentangled_attention(self): - fusion = FusionDisentangledAttention(self) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, self.hidden_size) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def conv_reformat(self): - fusion = FusionConvReformat(self) - fusion.apply() - - - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). 
- Returns a list of the graph input names based on the filter whether it is casted or not. - """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_swint_opt: - self.fuse_custom_fc() - self.fuse_swinT_serial_bias_add() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - if options.enable_gpt2_classify or options.enable_vit: - self.fuse_custom_fc_gpt2_classify() - - if options.enable_vit: - self.fuse_custom_fc() - - # if (options is None) or options.enable_attention: - # if options is not None: - # self.attention_mask.set_mask_format(options.attention_mask_format) - self.fuse_attention() - - self.conv_reformat() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - self.fuse_custom_fc() - - self.fuse_custom_xsoftmax() - - self.fuse_disentangled_attention() - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.fuse_custom_fc_activation() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. 
- """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py deleted file mode 100644 index 7324603e61bb7a13a57e586827c8fa67a9af4ae2..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py +++ /dev/null @@ -1,627 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Optional - -import onnx -from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper -from passes.fuse_series_bias_add import FusionSerialBiasAdd -from passes.fusion_albert_attention import FusionAlbertAttention -from passes.fusion_attention import AttentionMask, FusionAttention -from passes.fusion_biasgelu import FusionBiasGelu -from passes.fusion_customfc import ( - FusionCustomFC, - FusionCustomFCActivation, - FusionCustomFCGPT2, - FusionTorchvisionVitCustomFC, -) -from passes.fusion_disentangled_attention import FusionDisentangledAttention -from passes.fusion_embedlayer import FusionEmbedLayerNormalization -from passes.fusion_fastgelu import FusionFastGelu -from passes.fusion_format_roformer import ( - FusionFormatInvalidMask, - FusionRemoveUselessElementwise, -) -from passes.fusion_gelu import FusionGelu -from passes.fusion_gelu_approximation import FusionGeluApproximation -from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast -from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF -from passes.fusion_options import FusionOptions -from passes.fusion_qordered_attention import FusionQOrderedAttention -from passes.fusion_qordered_gelu import FusionQOrderedGelu -from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization -from passes.fusion_qordered_matmul import FusionQOrderedMatMul -from passes.fusion_reshape import FusionReshape -from passes.fusion_shape import FusionShape -from passes.fusion_skiplayernorm import ( - FusionBiasSkipLayerNormalization, - FusionSkipLayerNormalization, -) -from passes.fusion_swinl_attention import FusionSwinLAttention -from passes.fusion_utils import FusionUtils -from passes.fusion_videobert_attention import FusionVideoBertAttention -from passes.fusion_vit_attention import FusionVITAttention, FusionTorchvisionVITAttention -from passes.fusion_xsoftmax import FusionXSoftmax -from passes.fuse_inverse_sigmoid import FusionLayerInverseSigmoid -from passes.fuse_l2_normalization import FusionLayerL2Normalization -from passes.fuse_omdet_attention import FusionLayerOmdetAttention -from passes.onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class BertOptimizationOptions(FusionOptions): - """This class is deprecated""" - - def __init__(self, model_type): - logger.warning( - f"BertOptimizationOptions is depreciated. Please use FusionOptions instead." - ) - super().__init__(model_type) - - -class BertOnnxModel(OnnxModel): - def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): - """Initialize BERT ONNX Model. - - Args: - model (ModelProto): the ONNX model - num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). - hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). 
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - self.attention_fusion.apply() - FusionAlbertAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ).apply() - FusionVideoBertAttention(self).apply() - FusionVITAttention(self).apply() - FusionTorchvisionVITAttention(self).apply() - FusionSwinLAttention(self).apply() - FusionGptAttentionNoPast(self).apply() - # Only relevant in models with Q-DQ nodes - self.qordered_attention_fusion.apply() - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_torchvision_vit(self): - fusion = FusionTorchvisionVitCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_gpt2_classify(self): - fusion = FusionCustomFCGPT2(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def fuse_custom_xsoftmax(self): - fusion = FusionXSoftmax(self) - fusion.apply() - - def fuse_disentangled_attention(self): - fusion = FusionDisentangledAttention(self) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, self.hidden_size) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def fuse_omdet_inverse_sigmoid(self): - fusion = FusionLayerInverseSigmoid(self) - fusion.apply() - - def fuse_omdet_attention(self): - fusion = FusionLayerOmdetAttention(self) - fusion.apply() - - def fuse_l2_normalization(self): - fusion = FusionLayerL2Normalization(self) - fusion.apply() - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or 
Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. - """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_swint_opt: - self.fuse_custom_fc() - self.fuse_swinT_serial_bias_add() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - if options.enable_gpt2_classify or options.enable_vit: - self.fuse_custom_fc_gpt2_classify() - - if options.enable_vit: - self.fuse_custom_fc() - - if (options is None) or options.enable_attention: - if options is not None: - self.attention_mask.set_mask_format(options.attention_mask_format) - self.fuse_attention() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - self.fuse_custom_fc() - - if options.enable_omdet: - self.fuse_omdet_attention() - self.fuse_omdet_inverse_sigmoid() - self.fuse_l2_normalization() - - self.fuse_custom_xsoftmax() - - self.fuse_disentangled_attention() - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.fuse_custom_fc_activation() - - if options.enable_vit: - self.fuse_custom_fc_torchvision_vit() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. 
- """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py deleted file mode 100644 index cc59c37bd48f677a7d06f141f45eaa55aef54656..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py +++ /dev/null @@ -1,591 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_albert_attention import FusionAlbertAttention
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_conformer_attention import FusionConformerAttention
-from passes.fusion_conformer_xsoftmax import FusionConformerXSoftmax
-from passes.fusion_customfc import (
-    FusionConformerCustomFCActivation,
-    FusionCustomFC,
-    FusionCustomFCGPT2,
-)
-from passes.fusion_disentangled_attention import FusionDisentangledAttention
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
-    FusionFormatInvalidMask,
-    FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
-    FusionBiasSkipLayerNormalization,
-    FusionSkipLayerNormalization,
-)
-from passes.fusion_splitQKV import FusionSplitQKV
-from passes.fusion_swinl_attention import FusionSwinLAttention
-from passes.fusion_utils import FusionUtils
-from passes.fusion_vit_attention import FusionVITAttention
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class ConformerOptimizationOptions(FusionOptions):
-    """This class is deprecated"""
-
-    def __init__(self, model_type):
-        logger.warning(
-            "ConformerOptimizationOptions is deprecated. Please use FusionOptions instead."
-        )
-        super().__init__(model_type)
-
-
-class conformerOnnxModel(OnnxModel):
-    def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
-        """Initialize Conformer ONNX Model.
-
-        Args:
-            model (ModelProto): the ONNX model
-            num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
-            hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - FusionConformerAttention(self, self.hidden_size, self.num_heads).apply() - # Only relevant in models with Q-DQ nodes - self.qordered_attention_fusion.apply() - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_conformer_activation(self): - fusion = FusionConformerCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_gpt2_classify(self): - fusion = FusionCustomFCGPT2(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def fuse_custom_xsoftmax(self): - fusion = FusionConformerXSoftmax(self) - fusion.apply() - - def fuse_disentangled_attention(self): - fusion = FusionDisentangledAttention(self) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, self.hidden_size) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - def fuse_split_qkv(self): - fusion = FusionSplitQKV(self, self.hidden_size, self.num_heads) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. 
- """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_swint_opt: - self.fuse_custom_fc() - self.fuse_swinT_serial_bias_add() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - if options.enable_gpt2_classify or options.enable_vit: - self.fuse_custom_fc_gpt2_classify() - - if options.enable_vit: - self.fuse_custom_fc() - - self.fuse_custom_fc() - self.fuse_custom_xsoftmax() - - self.fuse_attention() - - self.fuse_split_qkv() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.remove_unused_constant() - self.fuse_custom_fc_conformer_activation() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. 
- """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_cosyvoice.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_cosyvoice.py deleted file mode 100755 index 98cfc6699ab5276f2fd37915a62487a173fb4d12..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_cosyvoice.py +++ /dev/null @@ -1,640 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Optional - -import onnx -from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper -from passes.fuse_series_bias_add import FusionSerialBiasAdd -from passes.fusion_albert_attention import FusionAlbertAttention -from passes.fusion_attention import AttentionMask, FusionAttention -from passes.fusion_biasgelu import FusionBiasGelu -from passes.fusion_customfc import ( - FusionCustomFC, - FusionCustomFCActivation, - FusionCustomFCGPT2, - FusionTorchvisionVitCustomFC, -) -from passes.fusion_disentangled_attention import FusionDisentangledAttention -from passes.fusion_embedlayer import FusionEmbedLayerNormalization -from passes.fusion_fastgelu import FusionFastGelu -from passes.fusion_format_roformer import ( - FusionFormatInvalidMask, - FusionRemoveUselessElementwise, -) -from passes.fusion_gelu import FusionGelu -from passes.fusion_gelu_approximation import FusionGeluApproximation -from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast -from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF -from passes.fusion_options import FusionOptions -from passes.fusion_qordered_attention import FusionQOrderedAttention -from passes.fusion_qordered_gelu import FusionQOrderedGelu -from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization -from passes.fusion_qordered_matmul import FusionQOrderedMatMul -from passes.fusion_reshape import FusionReshape -from passes.fusion_shape import FusionShape -from passes.fusion_skiplayernorm import ( - FusionBiasSkipLayerNormalization, - FusionSkipLayerNormalization, -) -from passes.fusion_swinl_attention import FusionSwinLAttention -from passes.fusion_utils import FusionUtils -from passes.fusion_videobert_attention import FusionVideoBertAttention -from passes.fusion_vit_attention import FusionVITAttention, FusionTorchvisionVITAttention -from passes.fusion_xsoftmax import FusionXSoftmax -from passes.fuse_inverse_sigmoid import FusionLayerInverseSigmoid -from passes.fuse_l2_normalization import FusionLayerL2Normalization -from passes.fuse_omdet_attention import FusionLayerOmdetAttention -from passes.onnx_model import OnnxModel - -from passes.fusion_cosyvoice_splitQKV_update_KVcache import FusionCosyVoiceSplitQKVUpdateKVCache -from passes.fusion_cosyvoice_attention import ( - FusionCosyvoiceAttention -) -from passes.fusion_cosyvoice_splitQKV import FusionSplitQKV - - - -logger = getLogger(__name__) - - - -class cosyvoiceOnnxModel(OnnxModel): - def __init__(self, model: ModelProto, num_heads: int = 16, hidden_size: int = 1024): - """Initialize BERT ONNX Model. - - Args: - model (ModelProto): the ONNX model - num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). - hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). 
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - self.attention_fusion.apply() - FusionAlbertAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ).apply() - FusionVideoBertAttention(self).apply() - FusionVITAttention(self).apply() - FusionTorchvisionVITAttention(self).apply() - FusionSwinLAttention(self).apply() - FusionGptAttentionNoPast(self).apply() - # Only relevant in models with Q-DQ nodes - self.qordered_attention_fusion.apply() - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_torchvision_vit(self): - fusion = FusionTorchvisionVitCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_gpt2_classify(self): - fusion = FusionCustomFCGPT2(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def fuse_custom_xsoftmax(self): - fusion = FusionXSoftmax(self) - fusion.apply() - - def fuse_disentangled_attention(self): - fusion = FusionDisentangledAttention(self) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, self.hidden_size) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def fuse_omdet_inverse_sigmoid(self): - fusion = FusionLayerInverseSigmoid(self) - fusion.apply() - - def fuse_omdet_attention(self): - fusion = FusionLayerOmdetAttention(self) - fusion.apply() - - def fuse_l2_normalization(self): - fusion = FusionLayerL2Normalization(self) - fusion.apply() - - def fuse_splitQKV_update_kv_cache(self): - fusion = FusionCosyVoiceSplitQKVUpdateKVCache(self, self.hidden_size, self.num_heads) - fusion.apply() - - def fuse_cosyvoice_attention(self): - fusion 
= FusionCosyvoiceAttention(self) - fusion.apply() - - def fuse_cosyvoice_split_qkv(self): - fusion = FusionSplitQKV(self, self.hidden_size, self.num_heads) - fusion.apply() - - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. - """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_swint_opt: - self.fuse_custom_fc() - self.fuse_swinT_serial_bias_add() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - if options.enable_gpt2_classify or options.enable_vit: - self.fuse_custom_fc_gpt2_classify() - - if options.enable_vit: - self.fuse_custom_fc() - - if (options is None) or options.enable_attention: - if options is not None: - self.attention_mask.set_mask_format(options.attention_mask_format) - self.fuse_attention() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - self.fuse_custom_fc() - - if options.enable_omdet: - self.fuse_omdet_attention() - self.fuse_omdet_inverse_sigmoid() - self.fuse_l2_normalization() - - self.fuse_splitQKV_update_kv_cache() - self.fuse_cosyvoice_attention() - self.fuse_cosyvoice_split_qkv() - - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.fuse_custom_fc_activation() - - if options.enable_vit: - self.fuse_custom_fc_torchvision_vit() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. 
- """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py deleted file mode 100644 index 7bffb2e7cbec870423cd006d33a617dd1e70d1fb..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py +++ /dev/null @@ -1,555 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Optional - -import onnx -from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper -from passes.fuse_series_bias_add import FusionSerialBiasAdd -from passes.fusion_albert_attention import FusionAlbertAttention -from passes.fusion_attention import AttentionMask, FusionAttention -from passes.fusion_biasgelu import FusionBiasGelu -from passes.fusion_customfc import ( - FusionCustomFC, - FusionCustomFCActivation, - FusionCustomFcRoformer, -) -from passes.fusion_disentangled_attention import FusionDisentangledAttention -from passes.fusion_embedlayer import FusionEmbedLayerNormalization -from passes.fusion_fastgelu import FusionFastGelu -from passes.fusion_format_roformer import ( - FusionFormatInvalidMask, - FusionRemoveUselessElementwise, -) -from passes.fusion_gelu import FusionGelu -from passes.fusion_gelu_approximation import FusionGeluApproximation -from passes.fusion_layernorm import ( - FusionLayerNormalization, - FusionLayerNormalizationKeras, - FusionLayerNormalizationTF, -) -from passes.fusion_options import FusionOptions -from passes.fusion_qordered_attention import FusionQOrderedAttention -from passes.fusion_qordered_gelu import FusionQOrderedGelu -from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization -from passes.fusion_qordered_matmul import FusionQOrderedMatMul -from passes.fusion_reshape import FusionReshape -from passes.fusion_roformer_attention import FusionRoformerCrossAttention -from passes.fusion_rope import FusionRoPE -from passes.fusion_shape import FusionShape -from passes.fusion_skiplayernorm import ( - FusionBiasSkipLayerNormalization, - FusionSkipLayerNormalization, -) -from passes.fusion_swinl_attention import FusionSwinLAttention -from passes.fusion_utils import FusionUtils -from passes.fusion_videobert_attention import FusionVideoBertAttention -from passes.fusion_vit_attention import FusionVITAttention -from passes.fusion_xsoftmax import FusionXSoftmax -from passes.onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class RoformerOnnxModel(OnnxModel): - def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): - """Initialize BERT ONNX Model. - - Args: - model (ModelProto): the ONNX model - num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). - hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). 
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_attention(self): - FusionRoformerCrossAttention(self).apply() - - def fuse_format_roformer(self): - # FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_custom_fc_roformer(self): - fusion = FusionCustomFcRoformer(self) - fusion.apply() - - def fuse_rope(self): - fusion = FusionRoPE(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalizationKeras(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. 
- """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - if options.enable_format_roformer: - self.fuse_format_roformer() - - self.fuse_custom_fc_roformer() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - self.fuse_custom_fc() - - if (options is None) or options.enable_attention: - if options is not None: - self.attention_mask.set_mask_format(options.attention_mask_format) - self.fuse_attention() - - self.fuse_rope() - - self.fuse_shape() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.fuse_custom_fc_activation() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. - """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py deleted file mode 100644 index dac070d24a66812c4b14cfeff5b7c78ff44c6711..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py +++ /dev/null @@ -1,550 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-import onnx
-from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
-from passes.fusion_attention import AttentionMask, FusionAttention
-from passes.fusion_biasgelu import FusionBiasGelu
-from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation
-from passes.fusion_embedlayer import FusionEmbedLayerNormalization
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_qordered_matmul import FusionQOrderedMatMul
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_rms_norm import FusionRMSNorm
-from passes.fusion_shape import FusionShape
-from passes.fusion_skiplayernorm import (
-    FusionBiasSkipLayerNormalization,
-    FusionSkipLayerNormalization,
-)
-from passes.fusion_splitQKV_update_KVcache import FusionSplitQKVUpdateKVCache
-from passes.fusion_t5_attention import (
-    FusionT5DecoderAttention,
-    FusionT5EncoderAttention,
-)
-from passes.fusion_utils import FusionUtils
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class BertOptimizationOptions(FusionOptions):
-    """This class is deprecated"""
-
-    def __init__(self, model_type):
-        logger.warning(
-            f"BertOptimizationOptions is deprecated. Please use FusionOptions instead."
-        )
-        super().__init__(model_type)
-
-
-class T5OnnxModel(OnnxModel):
-    def __init__(self, model: ModelProto, num_heads=12, hidden_size=768):
-        """Initialize T5 ONNX Model.
-
-        Args:
-            model (ModelProto): the ONNX model
-            num_heads (int, optional): number of attention heads. Defaults to 12 (0 means detect the parameter automatically).
-            hidden_size (int, optional): hidden dimension. Defaults to 768 (0 means detect the parameter automatically).
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - - super().__init__(model) - self.num_heads = num_heads - self.hidden_size = hidden_size - self.attention_mask = AttentionMask(self) - self.attention_fusion = FusionAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.qordered_attention_fusion = FusionQOrderedAttention( - self, self.hidden_size, self.num_heads, self.attention_mask - ) - self.utils = FusionUtils(self) - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_bias_gelu(self, is_fastgelu): - fusion = FusionBiasGelu(self, is_fastgelu) - fusion.apply() - - def gelu_approximation(self): - fusion = FusionGeluApproximation(self) - fusion.apply() - - def fuse_add_bias_skip_layer_norm(self): - fusion = FusionBiasSkipLayerNormalization(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_embed_layer(self): - fusion = FusionEmbedLayerNormalization(self) - fusion.apply() - - def fuse_rms_norm(self): - fusion = FusionRMSNorm(self) - fusion.apply() - - def fuse_t5_encoder_attention(self): - fusion = FusionT5EncoderAttention(self) - fusion.apply() - - def fuse_t5_decoder_attention(self): - fusion = FusionT5DecoderAttention(self) - fusion.apply() - # pass - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, hidden_size=768) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def fuse_skip_layer_norm(self): - fusion = FusionSkipLayerNormalization(self) - fusion.apply() - - def fuse_splitQKV_update_kv_cache(self): - fusion = FusionSplitQKVUpdateKVCache(self, self.hidden_size, self.num_heads) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - def fuse_qordered_mamtul(self): - fusion = FusionQOrderedMatMul(self) - fusion.apply() - - def get_graph_inputs_from_node_type( - self, op_type: str, input_indices: List[int], casted: bool - ): - """ - Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). - Returns a list of the graph input names based on the filter whether it is casted or not. 
- """ - graph_inputs = [] - - output_name_to_node = self.output_name_to_node() - nodes = self.get_nodes_by_op_type(op_type) - for node in nodes: - bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] - for bert_input in bert_inputs: - if self.find_graph_input(bert_input): - if not casted: - graph_inputs.append(bert_input) - elif bert_input in output_name_to_node: - parent = output_name_to_node[bert_input] - if ( - parent.op_type == "Cast" - and self.find_graph_input(parent.input[0]) is not None - ): - if casted: - graph_inputs.append(parent.input[0]) - return graph_inputs - - def get_graph_inputs_from_fused_nodes(self, casted: bool): - inputs = self.get_graph_inputs_from_node_type( - "EmbedLayerNormalization", [0, 1, 7], casted - ) - inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) - return inputs - - def change_graph_input_type( - self, - graph: GraphProto, - graph_input: ValueInfoProto, - new_type: int = TensorProto.INT32, - ): - """Change graph input type, and add Cast node if needed. - - Args: - graph (GraphProto): graph - graph_input (TensorProto): input of the graph - new_type (int, optional): new data type. Defaults to TensorProto.INT32. - - Returns: - NodeProto: a new Cast node that added. None if Cast node is not added. - List[NodeProto]: Cast nodes that have been removed. - """ - assert isinstance(graph, GraphProto) - assert isinstance(graph_input, ValueInfoProto) - assert self.find_graph_input(graph_input.name) - - if graph_input.type.tensor_type.elem_type == int(new_type): - return None, [] - - new_cast_node = None - nodes_to_remove = [] - - input_name_to_nodes = self.input_name_to_nodes() - if graph_input.name in input_name_to_nodes: - nodes = input_name_to_nodes[graph_input.name] - - # For children that is not Cast node, insert a Cast node to convert int32 to original data type. - nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] - if nodes_not_cast: - node_name = self.create_node_name("Cast") - output_name = node_name + "_" + graph_input.name - new_value_info = graph.value_info.add() - new_value_info.CopyFrom(graph_input) - new_value_info.name = output_name - new_cast_node = helper.make_node( - "Cast", - [graph_input.name], - [output_name], - to=int(graph_input.type.tensor_type.elem_type), - name=node_name, - ) - graph.node.extend([new_cast_node]) - - for node in nodes_not_cast: - OnnxModel.replace_node_input(node, graph_input.name, output_name) - - # For children that is Cast node, no need to insert Cast. - # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
- nodes_cast = [node for node in nodes if node.op_type == "Cast"] - for node in nodes_cast: - if OnnxModel.get_node_attribute(node, "to") == int(new_type): - self.replace_input_of_all_nodes(node.output[0], graph_input.name) - if not self.find_graph_output(node.output[0]): - nodes_to_remove.append(node) - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - - graph_input.type.tensor_type.elem_type = int(new_type) - return new_cast_node, nodes_to_remove - - def change_graph_inputs_to_int32(self): - """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" - graph = self.graph() - add_cast_count = 0 - remove_cast_count = 0 - for graph_input in graph.input: - new_node, removed_nodes = self.change_graph_input_type( - graph, graph_input, TensorProto.INT32 - ) - if new_node: - add_cast_count += 1 - remove_cast_count += len(removed_nodes) - logger.info( - f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." - ) - - def use_dynamic_axes( - self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" - ): - """ - Update input and output shape to use dynamic axes. - """ - bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( - casted=True - ) + self.get_graph_inputs_from_fused_nodes(casted=False) - - dynamic_batch_inputs = {} - for input in self.model.graph.input: - if input.name in bert_graph_inputs: - dim_proto = input.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - if dynamic_seq_len is not None: - dim_proto = input.type.tensor_type.shape.dim[1] - dim_proto.dim_param = dynamic_seq_len - - for output in self.model.graph.output: - dim_proto = output.type.tensor_type.shape.dim[0] - dim_proto.dim_param = dynamic_batch_dim - - def preprocess(self): - self.adjust_reshape_and_expand() - return - - def adjust_reshape_and_expand(self): - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Reshape": - # Clean up unneccessary reshape nodes. - # Find reshape nodes with no actually data in "shape" attribute and remove. - reshape_shape = self.get_constant_value(node.input[1]) - if reshape_shape is not None and reshape_shape.size == 0: - nodes_to_remove.extend([node]) - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - continue - - # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by - # changing current reshape's input to output of slice. 
- reshape_path = self.match_parent_path( - node, - ["Expand", "Expand", "Reshape", "Slice"], - [0, 0, 0, 0], - self.output_name_to_node(), - ) - if reshape_path is not None: - expand_node = reshape_path[-3] - expand_shape_value = self.get_constant_value(expand_node.input[1]) - - reshape_before_expand = reshape_path[-2] - shape_value = self.get_constant_value( - reshape_before_expand.input[1] - ) - - slice_node = reshape_path[-1] - if ( - expand_shape_value is not None - and shape_value is not None - and len(expand_shape_value) == 2 - and len(shape_value) == 1 - and expand_shape_value[1] == shape_value[0] - ): - node.input[0] = slice_node.output[0] - - if nodes_to_remove: - self.remove_nodes(nodes_to_remove) - logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") - - def clean_graph(self): - output_name_to_node = self.output_name_to_node() - nodes_to_remove = [] - for node in self.nodes(): - # Before: - # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ - # | | - # | v - # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # After: - # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum - # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) - op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} - if node.op_type in op_input_id: - i = op_input_id[node.op_type] - parent_nodes = self.match_parent_path( - node, - [ - "Cast", - "ConstantOfShape", - "Concat", - "Unsqueeze", - "Gather", - "Shape", - ], - [i, 0, 0, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - ( - cast, - constantOfShape, - concat, - unsqueeze, - gather, - shape, - ) = parent_nodes - if shape.input[0] == self.graph().input[0].name: - constantOfShape.input[0] = shape.output[0] - output_name_to_node = self.output_name_to_node() - - if node.op_type == "Attention": - # Before: - # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention - # After: - # remove this path, and remove the optional mask_index input of Attention node. - parent_nodes = self.match_parent_path( - node, - ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if parent_nodes is not None: - if parent_nodes[-1].input[0] == self.graph().input[0].name: - attention_node = helper.make_node( - "Attention", - inputs=node.input[0 : len(node.input) - 1], - outputs=node.output, - name=node.name + "_remove_mask", - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [helper.make_attribute("num_heads", self.num_heads)] - ) - self.add_node( - attention_node, self.get_graph_by_node(attention_node).name - ) - nodes_to_remove.append(node) - self.remove_nodes(nodes_to_remove) - - def postprocess(self): - self.clean_graph() - self.prune_graph() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
- self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.preprocess() - - self.fuse_reshape() - - if (options is None) or options.enable_skip_layer_norm: - self.fuse_skip_layer_norm() - - # Perform the MatMul fusion after the Attention fusion as we do not - # want to fuse the MatMuls inside the Attention subgraphs - if (options is None) or options.enable_qordered_matmul: - self.fuse_qordered_mamtul() - - self.fuse_shape() - - self.fuse_rms_norm() - - self.fuse_t5_encoder_attention() - - self.fuse_t5_decoder_attention() - - self.fuse_splitQKV_update_kv_cache() - - if (options is None) or options.enable_embed_layer_norm: - self.fuse_embed_layer() - - # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. - self.utils.remove_useless_reshape_nodes() - - self.postprocess() - - # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization - if (options is None) or options.enable_bias_gelu: - # Fuse Gelu and Add Bias before it. - self.fuse_bias_gelu(is_fastgelu=True) - self.fuse_bias_gelu(is_fastgelu=False) - - if (options is None) or options.enable_bias_skip_layer_norm: - # Fuse SkipLayerNormalization and Add Bias before it. - self.fuse_add_bias_skip_layer_norm() - - if options is not None and options.enable_gelu_approximation: - self.gelu_approximation() - - self.remove_unused_constant() - - # Use symbolic batch dimension in input and output. - if add_dynamic_axes: - self.use_dynamic_axes() - - logger.info(f"opset version: {self.get_opset_version()}") - - def get_fused_operator_statistics(self): - """ - Returns node count of fused operators. - """ - op_count = {} - ops = [ - "EmbedLayerNormalization", - "Attention", - "QOrderedAttention", - "Gelu", - "QOrderedGelu", - "FastGelu", - "BiasGelu", - "LayerNormalization", - "QOrderedLayerNormalization", - "SkipLayerNormalization", - "QOrderedMatMul", - ] - for op in ops: - nodes = self.get_nodes_by_op_type(op) - op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") - return op_count - - def is_fully_optimized(self): - """ - Returns True when the model is fully optimized. - """ - op_count = self.get_fused_operator_statistics() - embed = op_count["EmbedLayerNormalization"] - attention = op_count["Attention"] + op_count["QOrderedAttention"] - gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] - layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = ( - (embed > 0) - and (attention > 0) - and (attention == gelu) - and (layer_norm >= 2 * attention) - ) - - if layer_norm == 0: - logger.debug("Layer Normalization not fused") - - if gelu == 0: - logger.debug("Gelu/FastGelu not fused") - - if embed == 0: - logger.debug("Embed Layer not fused") - - if attention == 0: - logger.warning("Attention not fused") - - return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py deleted file mode 100644 index 42b504c42edfc006b5efac0d385001780d296fb2..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-from logging import getLogger
-from typing import List, Optional
-
-from onnx import ModelProto
-from passes.fuse_series_bias_add import FusionSerialBiasAdd
-from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation
-from passes.fusion_fastgelu import FusionFastGelu
-from passes.fusion_format_roformer import (
-    FusionFormatInvalidMask,
-    FusionRemoveUselessElementwise,
-)
-from passes.fusion_gelu import FusionGelu
-from passes.fusion_gelu_approximation import FusionGeluApproximation
-from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
-from passes.fusion_options import FusionOptions
-from passes.fusion_qordered_attention import FusionQOrderedAttention
-from passes.fusion_qordered_gelu import FusionQOrderedGelu
-from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
-from passes.fusion_reshape import FusionReshape
-from passes.fusion_shape import FusionShape
-from passes.fusion_utils import FusionUtils
-from passes.fusion_yolov5_decoder import FusionYoloV5Decoder
-from passes.onnx_model import OnnxModel
-
-logger = getLogger(__name__)
-
-
-class YoloOnnxModel(OnnxModel):
-    def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
-        """Initialize YOLO ONNX Model.
-
-        Args:
-            model (ModelProto): the ONNX model
-            num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
-            hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
- """ - assert (num_heads == 0 and hidden_size == 0) or ( - num_heads > 0 and hidden_size % num_heads == 0 - ) - super().__init__(model) - self.utils = FusionUtils(self) - - def fuse_format_roformer(self): - FusionRemoveUselessElementwise(self).apply() - fusion = FusionFormatInvalidMask(self) - fusion.apply() - - def fuse_custom_fc(self): - fusion = FusionCustomFC(self) - fusion.apply() - - def fuse_custom_fc_activation(self): - fusion = FusionCustomFCActivation(self) - fusion.apply() - - def fuse_swinT_serial_bias_add(self): - fusion = FusionSerialBiasAdd(self) - fusion.apply() - - def fuse_gelu(self): - fusion = FusionGelu(self) - fusion.apply() - fusion = FusionFastGelu(self) - fusion.apply() - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedGelu(self) - fusion.apply() - - def fuse_reshape(self): - fusion = FusionReshape(self) - fusion.apply() - - def fuse_shape(self): - fusion = FusionShape(self) - fusion.apply() - - def fuse_layer_norm(self): - fusion = FusionLayerNormalization(self, 0) - fusion.apply() - - fusion = FusionLayerNormalizationTF(self) - fusion.apply() - - # Only relevant in models with Q-DQ nodes - fusion = FusionQOrderedLayerNormalization(self) - fusion.apply() - - def optimize( - self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False - ): - if (options is not None) and not options.enable_shape_inference: - self.disable_shape_inference() - - self.utils.remove_identity_nodes() - - # Remove cast nodes that having same data type of input and output based on symbolic shape inference. - self.utils.remove_useless_cast_nodes() - - if (options is None) or options.enable_layer_norm: - self.fuse_layer_norm() - - if (options is None) or options.enable_gelu: - self.fuse_gelu() - - self.fuse_reshape() - - FusionYoloV5Decoder(self).apply() - self.remove_unused_constant() - logger.info(f"opset version: {self.get_opset_version()}") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md deleted file mode 100644 index dc823d366b327141bd5646e7d3aef153349cea8e..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md +++ /dev/null @@ -1,51 +0,0 @@ -# IxRT optimizer - -## 1. optimizer 简介 -`optimizer` 是一个 ixrt 中集成的图融合工具,用于将onnx图中的op融合成对应的ixrt plugin; - -## 2. optimizer 功能说明 -| 功能 | 说明 | -| -------------- | ---- | -| 多 batchsize 支持 | 支持设置不同 batchsize 进行推理测试 | -| 动态图支持 | 支持融合动态图和静态图 | -| 模型支持 | 目前测试通过videobert, roberta, deberta, swinL, roformer, albert等模型 | - -## 3. optimizer 运行参数 -| 参数 | 说明 | -| -------------- | ---- | -| `--onnx` | 必选 ,指定要运行的 onnx 模型路径 | -| `--num_heads` | 可选 ,指定模型对应Attention模块注意力头的个数 | -|`--hidden_size` | 可选, 模型模型隐藏层的大小| -|`--input_shapes` | 可选 ,指定模型输入数据类型,示例 --input_shapes "input_name1:3x224x224, input_name2:3x224x224"类型 | -| `--dump_onnx` | 可选 ,用于图融合过程中dump出中间的onnx图 | -|`--model_type` | 可选 ,可以指定要融合的模型类型,默认是"bert", 可选["bert", "swint", "roformer"]| -|`--log_level` |可选 ,指定ixrt运行时显示日志的等级, 可指定为debug、info、error,默认为 info| - - -## 4. 
运行示例 - -### 4.1 示例1:融合albert|videobert|roberta|deberta -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} -``` - -### 4.2 示例2:融合swinL -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint -``` - -### 4.3 示例3:融合roformer -```bash -cd oss/tools/optimizer -python3 optimizer.py --onnx ${MODEL_PATH} --model_type roformer -``` - -### 4.4 精度验证 - -请参考[高级话题](5_advanced_topics.md)中的精度对比工具一节,了解详细使用方法和原理。 - -也可以用[C++ API 使用简介](3_cpp_api.md)或 [Python API 使用简介](4_python_api.md) - -具体使用方法可以参考oss/samples diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py deleted file mode 100644 index 0f301e3a58e14713c7ebb26342a6fb39ecdca80e..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -import argparse -import logging -import time -from typing import Dict, Optional - -import onnx -from onnx import ModelProto, helper, load_model -from onnx_model_bert import BertOnnxModel -from onnx_model_roformer import RoformerOnnxModel -from onnx_model_conformer import conformerOnnxModel -from onnx_model_t5 import T5OnnxModel -from onnx_model_yolo import YoloOnnxModel -from onnx_model_PVT import PVTOnnxModel -from onnx_model_cosyvoice import cosyvoiceOnnxModel - - -from onnxsim import simplify -from passes.fusion_options import FusionOptions -from passes.symbolic_shape_infer import SymbolicShapeInference - -logger = logging.getLogger(__name__) -MODEL_TYPES = { - "bert": (BertOnnxModel, None, "pytorch", 1), - "swint": (BertOnnxModel, None, "pytorch", 1), - "roformer": (RoformerOnnxModel, None, "tf2onnx", 1), - "gpt2": (BertOnnxModel, None, "pytorch", 1), - "t5": (T5OnnxModel, None, "tf2onnx", 1), - "yolo": (YoloOnnxModel, None, "pytorch", 1), - "vit": (BertOnnxModel, None, "pytorch", 1), - "conformer": (conformerOnnxModel, None, "pytorch", 1), - "PVT": (PVTOnnxModel, None, "pytorch", 1), - "omdet": (BertOnnxModel, None, "pytorch", 1), - "cosyvoice": (cosyvoiceOnnxModel, None, "pytorch", 1) - -} - - -def optimize_by_fusion( - model: ModelProto, - model_type: str = "bert", - num_heads: int = 0, - hidden_size: int = 0, - optimization_options: Optional[FusionOptions] = None, -): - """Optimize Model by graph fusion logic. - - Note that ONNXRuntime graph optimizations (like constant folding) will not be applied. So it is better to enable - constant folding during exporting ONNX model, or run optimize_by_onnxruntime on the model first like optimize_model. - - For BERT model, num_heads and hidden_size are optional. For other model types, you need specify these parameters. 
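An illustrative invocation for such a non-`bert` case is shown below; the head count and hidden size are placeholders only (they must match the exported model, and are not values taken from this repository):

```bash
# Illustrative only: --num_heads/--hidden_size are placeholders for the real model dimensions.
python3 optimizer.py --onnx ${MODEL_PATH} --model_type t5 --num_heads 8 --hidden_size 512
```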
- - Args: - model (ModelProto): model object - model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'. - num_heads (int, optional): number of attention heads. Defaults to 0. - 0 allows detect the parameter from graph automatically (for model_type "bert" only). - hidden_size (int, optional): hidden size. Defaults to 0. - 0 allows detect the parameter from graph automatically (for model_type "bert" only). - optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions. Defaults to None. - - Returns: - object of an optimizer class. - """ - if model_type != "bert" and (num_heads == 0 or hidden_size == 0): - logger.warning( - "Please specify parameters of num_heads and hidden_size when model_type is not 'bert'" - ) - - (optimizer_class, transformer_class, producer, _) = MODEL_TYPES[model_type] - - if model.producer_name and producer != model.producer_name: - logger.warning( - f'Model producer not matched: Expected "{producer}", Got "{model.producer_name}".' - "Please specify correct --model_type parameter." - ) - - if optimization_options is None: - optimization_options = FusionOptions(model_type) - - optimizer = optimizer_class(model, num_heads, hidden_size) - - optimizer.optimize(optimization_options) - - optimizer.topological_sort() - - return optimizer, transformer_class - - -def optimize_to_ixrt(args): - onnx_name = args.onnx[:-5] - model = onnx.load(args.onnx) - if not args.not_sim: - logger.info("simplify..") - simplified_model, check = simplify(model) - logger.info("simplify model end...") - if args.dump_onnx: - onnx.save(simplified_model, onnx_name + "_sim.onnx") - - # transfer to static shape and optimize it - static_sim_model = simplified_model - if args.input_shapes: - for input_tensor in simplified_model.graph.input: - if input_tensor.name in args.input_shapes.keys(): - new_shape = args.input_shapes[input_tensor.name] - dim_list = [] - for dim in new_shape: - if isinstance(dim, int): - dim_proto = onnx.TensorShapeProto.Dimension() - dim_proto.dim_value = dim - dim_list.append(dim_proto) - elif isinstance(dim, str): - dim_proto = onnx.TensorShapeProto.Dimension() - dim_proto.dim_param = dim - dim_list.append(dim_proto) - - del input_tensor.type.tensor_type.shape.dim[:] - input_tensor.type.tensor_type.shape.dim.extend(dim_list) - - try: - auto_merge = False - if args.model_type in ["roformer"]: - auto_merge = True - static_model = SymbolicShapeInference.infer_shapes( - simplified_model, 2**31 - 1, auto_merge, False, 3 - ) - static_sim_model, check = simplify(static_model) - if args.dump_onnx: - onnx.save(static_sim_model, onnx_name + "_sim_static_sim.onnx") - except Exception as e: - static_model = static_sim_model = simplified_model - - if args.dump_onnx: - onnx.save(static_model, onnx_name + "_sim_static.onnx") - if args.not_sim: - static_sim_model = model - - logger.info("start fusion..") - opt_model, _ = optimize_by_fusion( - static_sim_model, args.model_type, args.num_heads, args.hidden_size - ) - opt_model.save_model_to_file(onnx_name + "_end.onnx") - logger.info("done..") - - -def parse_params(params_str): - params = {} - for item in params_str.replace(" ", "").split(","): - key, value = item.split(":") - params[key] = [int(x) if x.isdigit() else x for x in value.split("x")] - return params - - -def args_parser(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--onnx", type=str, default=None, required=False, help="ONNX model file path" - ) - parser.add_argument( - "--num_heads", - 
type=int, - default=0, - help="Used in model optimization. The num of the head used in the network", - ) - parser.add_argument( - "--hidden_size", - type=int, - default=0, - help="Used in model optimization. The hidden_size used in the network", - ) - parser.add_argument( - "--input_shapes", - type=parse_params, - help='Static input_shapes to the inference, format is --input_shapes "input_name1:3x224x224, input_name2:3x224x224"', - ) - parser.add_argument( - "--dump_onnx", - action="store_true", - help="Whether to dump onnx", - ) - parser.add_argument( - "--model_type", - type=str, - default="bert", - choices=["bert", "swint", "roformer", "t5", "yolo", "gpt2", "vit", "conformer","PVT","omdet","cosyvoice"], - help="Which kind of model to optimize", - ) - parser.add_argument( - "--log_level", - type=str, - default="info", - choices=["debug", "info", "error"], - help="Which kind of model to optimize", - ) - - parser.add_argument( - "--not_sim", - action="store_true", - default=False, - help="simplify model or not", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = args_parser() - if args.log_level == "info": - logging.basicConfig(level=logging.INFO) - elif args.log_level == "debug": - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.ERROR) - optimize_to_ixrt(args) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py deleted file mode 100644 index de522e5b082b122a28b0a0423a40909598aa82d5..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py deleted file mode 100644 index 96da8751b0200bb8610e3dd5070f26ebc51e97ac..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py +++ /dev/null @@ -1,477 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-#
-
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
-# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
-# Modifications: keep_io_types can be list of names; convert initializers if needed to preserve precision; add force_fp16_initializers option.
-
-import itertools
-import logging
-from typing import Dict, List
-
-import numpy as np
-import onnx
-from onnx import helper, numpy_helper
-from onnx import onnx_pb as onnx_proto
-from packaging import version
-
-logger = logging.getLogger(__name__)
-
-
-def _npfloat16_to_int(np_list):
-    """
-    Convert numpy float16 to python int.
-
-    :param np_list: numpy float16 list
-    :return int_list: python int list
-    """
-    return [int(bin(_.view("H"))[2:].zfill(16), 2) for _ in np_list]
-
-
-def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65504.0):
-    """
-    Convert float32 numpy array to float16 without changing sign or finiteness.
-    Positive values less than min_positive_val are mapped to min_positive_val.
-    Positive finite values greater than max_finite_val are mapped to max_finite_val.
-    Similar for negative values. NaN, 0, inf, and -inf are unchanged.
-    """
-
-    def between(a, b, c):
-        return np.logical_and(a < b, b < c)
-
-    np_array = np.where(
-        between(0, np_array, min_positive_val), min_positive_val, np_array
-    )
-    np_array = np.where(
-        between(-min_positive_val, np_array, 0), -min_positive_val, np_array
-    )
-    np_array = np.where(
-        between(max_finite_val, np_array, float("inf")), max_finite_val, np_array
-    )
-    np_array = np.where(
-        between(float("-inf"), np_array, -max_finite_val), -max_finite_val, np_array
-    )
-    return np.float16(np_array)
-
-
-def convert_tensor_float_to_float16(
-    tensor, min_positive_val=5.96e-08, max_finite_val=65504.0
-):
-    """Convert tensor float to float16.
-
-    Args:
-        tensor (TensorProto): the tensor to convert.
-        min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08.
-        max_finite_val (float, optional): maximal finite value. Defaults to 65504.0.
-
-    Raises:
-        ValueError: input type is not TensorProto.
-
-    Returns:
-        TensorProto: the converted tensor.
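A minimal NumPy sketch of the clamping rules documented above (illustrative only; it mirrors the described mapping with plain NumPy rather than importing this module):

```python
import numpy as np

MIN_POSITIVE, MAX_FINITE = 5.96e-08, 65504.0  # float16 subnormal min / finite max

x = np.array([1e-12, 3.0, 1e9, -1e-12, -1e9, 0.0], dtype=np.float32)

# Small nonzero magnitudes are pushed out to +/-MIN_POSITIVE, large finite
# magnitudes are pulled in to +/-MAX_FINITE; zero, NaN and +/-inf pass through.
y = np.where((x > 0) & (x < MIN_POSITIVE), MIN_POSITIVE, x)
y = np.where((x < 0) & (x > -MIN_POSITIVE), -MIN_POSITIVE, y)
y = np.where(np.isfinite(x) & (x > MAX_FINITE), MAX_FINITE, y)
y = np.where(np.isfinite(x) & (x < -MAX_FINITE), -MAX_FINITE, y)

print(y.astype(np.float16))  # no overflow to inf, no underflow to zero
```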
- """ - - if not isinstance(tensor, onnx_proto.TensorProto): - raise ValueError( - "Expected input type is an ONNX TensorProto but got %s" % type(tensor) - ) - - if tensor.data_type == onnx_proto.TensorProto.FLOAT: - tensor.data_type = onnx_proto.TensorProto.FLOAT16 - # convert float_data (float type) to float16 and write to int32_data - if tensor.float_data: - float16_data = convert_np_to_float16( - np.array(tensor.float_data), min_positive_val, max_finite_val - ) - int_list = _npfloat16_to_int(float16_data) - tensor.int32_data[:] = int_list - tensor.float_data[:] = [] - # convert raw_data (bytes type) - if tensor.raw_data: - # convert n.raw_data to float - float32_list = np.frombuffer(tensor.raw_data, dtype="float32") - # convert float to float16 - float16_list = convert_np_to_float16( - float32_list, min_positive_val, max_finite_val - ) - # convert float16 to bytes and write back to raw_data - tensor.raw_data = float16_list.tobytes() - return tensor - - -def make_value_info_from_tensor(tensor): - shape = numpy_helper.to_array(tensor).shape - return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape) - - -DEFAULT_OP_BLOCK_LIST = [ - "ArrayFeatureExtractor", - "Binarizer", - "CastMap", - "CategoryMapper", - "DictVectorizer", - "FeatureVectorizer", - "Imputer", - "LabelEncoder", - "LinearClassifier", - "LinearRegressor", - "Normalizer", - "OneHotEncoder", - "SVMClassifier", - "SVMRegressor", - "Scaler", - "TreeEnsembleClassifier", - "TreeEnsembleRegressor", - "ZipMap", - "NonMaxSuppression", - "TopK", - "RoiAlign", - "Resize", - "Range", - "CumSum", - "Min", - "Max", - "Upsample", -] - - -class InitializerTracker: - """Class for keeping track of initializer.""" - - def __init__(self, initializer: onnx_proto.TensorProto): - self.initializer = initializer - self.fp32_nodes = [] - self.fp16_nodes = [] - - def add_node(self, node: onnx_proto.NodeProto, is_node_blocked): - if is_node_blocked: - self.fp32_nodes.append(node) - else: - self.fp16_nodes.append(node) - - -def convert_float_to_float16( - model, - min_positive_val=5.96e-08, - max_finite_val=65504.0, - keep_io_types=False, - disable_shape_infer=False, - op_block_list=None, - node_block_list=None, - force_fp16_initializers=False, -): - """Convert model tensor float type in the ONNX ModelProto input to tensor float16. - - Args: - model (ModelProto): The ONNX model to convert. - min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08. - max_finite_val (float, optional): maximal finite value of float16. Defaults to 65504. - keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names. - If True, model inputs/outputs should be left as float32. Defaults to False. - disable_shape_infer (bool, optional): Skips running onnx shape/type inference. Useful if shape inference has been done. Defaults to False. - op_block_list (List[str], optional): List of op types to leave as float32. - Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST` as default. - node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None. - force_fp16_initializers(bool): force converting all float initializers to float16. - Default to false, which will convert only the one needed to avoid precision loss. - Raises: - ValueError: input type is not ModelProto. - - Returns: - ModelProto: converted model. - """ - assert ( - min_positive_val >= 5.96e-08 - ), "invalid min_positive_val. 
smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05" - assert max_finite_val <= float( - np.finfo(np.float16).max - ), "invalid max_finite_val. largest float16 value: 65504" - - func_infer_shape = None - if not disable_shape_infer and version.parse(onnx.__version__) >= version.parse( - "1.2.0" - ): - try: - from onnx.shape_inference import infer_shapes - - func_infer_shape = infer_shapes - finally: - pass - - if not isinstance(model, onnx_proto.ModelProto): - raise ValueError( - "Expected model type is an ONNX ModelProto but got %s" % type(model) - ) - - # create blocklists - if op_block_list is None: - op_block_list = DEFAULT_OP_BLOCK_LIST - if node_block_list is None: - node_block_list = [] - op_block_list = set(op_block_list) - node_block_list = set(node_block_list) - - logger.debug( - f"fp16 parameters: min_positive_val={min_positive_val} max_finite_val={max_finite_val} keep_io_types={keep_io_types} disable_shape_infer={disable_shape_infer} op_block_list={op_block_list} node_block_list={node_block_list} force_fp16_initializers={force_fp16_initializers}" - ) - - # create a queue for BFS - queue = [] - value_info_list = [] - node_list = [] - # type inference on input model - if func_infer_shape is not None: - model = func_infer_shape(model) - queue.append(model) - name_mapping = {} - graph_io_to_skip = set() - io_casts = set() - - fp32_inputs = [ - n.name - for n in model.graph.input - if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT - ] - fp32_outputs = [ - n.name - for n in model.graph.output - if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT - ] - if isinstance(keep_io_types, list): - fp32_inputs = [n for n in fp32_inputs if n in keep_io_types] - fp32_outputs = [n for n in fp32_outputs if n in keep_io_types] - elif not keep_io_types: - fp32_inputs = [] - fp32_outputs = [] - - for i, n in enumerate(model.graph.input): - if n.name in fp32_inputs: - output_name = "graph_input_cast_" + str(i) - name_mapping[n.name] = output_name - graph_io_to_skip.add(n.name) - - node_name = "graph_input_cast" + str(i) - new_value_info = model.graph.value_info.add() - new_value_info.CopyFrom(n) - new_value_info.name = output_name - new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 - # add Cast node (from tensor(float) to tensor(float16) after graph input - new_node = [ - helper.make_node("Cast", [n.name], [output_name], to=10, name=node_name) - ] - model.graph.node.extend(new_node) - value_info_list.append(new_value_info) - io_casts.add(node_name) - - for i, n in enumerate(model.graph.output): - if n.name in fp32_outputs: - input_name = "graph_output_cast_" + str(i) - name_mapping[n.name] = input_name - graph_io_to_skip.add(n.name) - - node_name = "graph_output_cast" + str(i) - # add Cast node (from tensor(float16) to tensor(float) before graph output - new_value_info = model.graph.value_info.add() - new_value_info.CopyFrom(n) - new_value_info.name = input_name - new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 - new_node = [ - helper.make_node("Cast", [input_name], [n.name], to=1, name=node_name) - ] - model.graph.node.extend(new_node) - value_info_list.append(new_value_info) - io_casts.add(node_name) - - fp32_initializers: Dict[str, InitializerTracker] = {} - while queue: - next_level = [] - for q in queue: - # if q is model, push q.graph (GraphProto) - if isinstance(q, onnx_proto.ModelProto): - next_level.append(q.graph) - # if q is model.graph, push q.node.attribute (AttributeProto) - if 
isinstance(q, onnx_proto.GraphProto): - for n in q.initializer: # TensorProto type - if n.data_type == onnx_proto.TensorProto.FLOAT: - assert n.name not in fp32_initializers - fp32_initializers[n.name] = InitializerTracker(n) - - for n in q.node: - # if n is in the block list (doesn't support float16), no conversion for the node, - # and save the node for further processing - if n.name in io_casts: - continue - for i in range(len(n.input)): - if n.input[i] in name_mapping: - n.input[i] = name_mapping[n.input[i]] - for i in range(len(n.output)): - if n.output[i] in name_mapping: - n.output[i] = name_mapping[n.output[i]] - - is_node_blocked = ( - n.op_type in op_block_list or n.name in node_block_list - ) - for input in n.input: - if input in fp32_initializers: - fp32_initializers[input].add_node(n, is_node_blocked) - - if is_node_blocked: - node_list.append(n) - else: - if n.op_type == "Cast": - for attr in n.attribute: - if attr.name == "to" and attr.i == 1: - attr.i = 10 - break - for attr in n.attribute: - next_level.append(attr) - # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto) - # and process node.attribute.t and node.attribute.tensors (TensorProto) - if isinstance(q, onnx_proto.AttributeProto): - next_level.append(q.g) - for n in q.graphs: - next_level.append(n) - q.t.CopyFrom( - convert_tensor_float_to_float16( - q.t, min_positive_val, max_finite_val - ) - ) - for n in q.tensors: - n = convert_tensor_float_to_float16( - n, min_positive_val, max_finite_val - ) - # if q is graph, process input, output and value_info (ValueInfoProto) - if isinstance(q, onnx_proto.GraphProto): - # Note that float initializers tracked by fp32_initializers will be processed later. - # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to - # tensor(float16) except map and seq(map). And save them in value_info_list for further processing - for n in itertools.chain(q.input, q.output, q.value_info): - if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: - if n.name not in graph_io_to_skip: - n.type.tensor_type.elem_type = ( - onnx_proto.TensorProto.FLOAT16 - ) - value_info_list.append(n) - if n.type.HasField("sequence_type"): - if ( - n.type.sequence_type.elem_type.tensor_type.elem_type - == onnx_proto.TensorProto.FLOAT - ): - if n.name not in graph_io_to_skip: - n.type.sequence_type.elem_type.tensor_type.elem_type = ( - onnx_proto.TensorProto.FLOAT16 - ) - value_info_list.append(n) - - queue = next_level - - for key, value in fp32_initializers.items(): - # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes. - if force_fp16_initializers or value.fp16_nodes: - value.initializer = convert_tensor_float_to_float16( - value.initializer, min_positive_val, max_finite_val - ) - value_info_list.append(make_value_info_from_tensor(value.initializer)) - if value.fp32_nodes and not force_fp16_initializers: - logger.info( - "initializer is used by both fp32 and fp16 nodes. 
Consider add these nodes to block list:{}".format( - value.fp16_nodes - ) - ) - - # process the nodes in block list that doesn't support tensor(float16) - for node in node_list: - # if input's name is in the value_info_list meaning input is tensor(float16) type, - # insert a float16 to float Cast node before the node, - # change current node's input name and create new value_info for the new name - for i in range(len(node.input)): - input = node.input[i] - for value_info in value_info_list: - if input == value_info.name: - # create new value_info for current node's new input name - new_value_info = model.graph.value_info.add() - new_value_info.CopyFrom(value_info) - output_name = node.name + "_input_cast_" + str(i) - new_value_info.name = output_name - new_value_info.type.tensor_type.elem_type = ( - onnx_proto.TensorProto.FLOAT - ) - # add Cast node (from tensor(float16) to tensor(float) before current node - node_name = node.name + "_input_cast" + str(i) - new_node = [ - helper.make_node( - "Cast", [input], [output_name], to=1, name=node_name - ) - ] - model.graph.node.extend(new_node) - # change current node's input name - node.input[i] = output_name - break - # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a float to - # float16 Cast node after the node, change current node's output name and create new value_info for the new name - for i in range(len(node.output)): - output = node.output[i] - for value_info in value_info_list: - if output == value_info.name: - # create new value_info for current node's new output - new_value_info = model.graph.value_info.add() - new_value_info.CopyFrom(value_info) - input_name = node.name + "_output_cast_" + str(i) - new_value_info.name = input_name - new_value_info.type.tensor_type.elem_type = ( - onnx_proto.TensorProto.FLOAT - ) - # add Cast node (from tensor(float) to tensor(float16) after current node - node_name = node.name + "_output_cast" + str(i) - new_node = [ - helper.make_node( - "Cast", [input_name], [output], to=10, name=node_name - ) - ] - model.graph.node.extend(new_node) - # change current node's input name - node.output[i] = input_name - break - return model - - -def float_to_float16_max_diff( - tensor, min_positive_val=5.96e-08, max_finite_val=65504.0 -): - """Measure the maximum absolute difference after converting a float tensor to float16.""" - if not isinstance(tensor, onnx_proto.TensorProto): - raise ValueError( - "Expected input type is an ONNX TensorProto but got %s" % type(tensor) - ) - if tensor.data_type != onnx_proto.TensorProto.FLOAT: - raise ValueError("Expected tensor data type is float.") - - float32_data = None - if tensor.float_data: - float32_data = np.array(tensor.float_data) - - if tensor.raw_data: - float32_data = np.frombuffer(tensor.raw_data, dtype="float32") - - if float32_data is None: - raise RuntimeError("external data not loaded!") - - float16_data = convert_np_to_float16(float32_data, min_positive_val, max_finite_val) - return np.amax(np.abs(float32_data - np.float32(float16_data))) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_inverse_sigmoid.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_inverse_sigmoid.py deleted file mode 100644 index 9862d9ee4bee8da619750b2544ddc48d35be0fa9..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_inverse_sigmoid.py +++ /dev/null @@ -1,85 +0,0 @@ - 
-from logging import getLogger -from typing import Dict - -import numpy as np -from onnx import TensorProto, helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - -class FusionLayerInverseSigmoid(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, "InverseSigmoid", "Clip" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - +------------Clip-----------+ - | | - | v - [Root] --> Clip--> Sub --> Clip --> Div --> Log - """ - children = self.model.get_children(node, input_name_to_nodes) - if len(children) != 2: - return - - root_input = node.input[0] - - if not ((children[0].op_type == "Sub" and children[1].op_type == "Clip") or (children[0].op_type == "Clip" and children[1].op_type == "Sub")): - return - - log_node = None - for child in children: - log_node = self.model.find_first_child_by_type( - child, "Log", input_name_to_nodes, recursive=True - ) - if log_node is not None: - break - if log_node is None: - return - parent_nodes = self.model.match_parent_path( - log_node, - ["Div", "Clip", "Sub", "Clip"], - [0, 1, 0, 1], - output_name_to_node, - ) - if parent_nodes is None: - return - - sub_node = parent_nodes[2] - if sub_node not in children: - return - - div_node = parent_nodes[0] - div_parents_nodes = self.model.get_parents(div_node) - if len(div_parents_nodes) != 2: - return - if div_parents_nodes[0].op_type != "Clip": - return - if div_parents_nodes[0] not in children: - return - - subgraph_nodes = [node] - subgraph_nodes.extend([log_node]) - subgraph_nodes.extend(parent_nodes) - subgraph_nodes.extend([div_parents_nodes[0]]) - _, eps_val = self.model.get_constant_input(div_parents_nodes[0]) - - self.nodes_to_remove.extend(subgraph_nodes) - inverse_sigmoid_node = helper.make_node( - "InverseSigmoid", - inputs=[node.input[0]], - outputs=[log_node.output[0]], - name=self.model.create_node_name( - "InverseSigmoid", name_prefix="InverseSigmoid" - ), - ) - inverse_sigmoid_node.attribute.extend( - [helper.make_attribute("epsilon", float(eps_val))] - ) - self.nodes_to_add.append(inverse_sigmoid_node) - self.node_name_to_graph_name[inverse_sigmoid_node.name] = self.this_graph_name \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_l2_normalization.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_l2_normalization.py deleted file mode 100644 index bfd1ed28eb8b0f3d7c65b1e31da8c1dc45415ce7..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_l2_normalization.py +++ /dev/null @@ -1,69 +0,0 @@ -from logging import getLogger -from typing import Dict - -import numpy as np -from onnx import TensorProto, helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - -class FusionLayerL2Normalization(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, "L2Normalization", "Abs" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - +-------------------------------------------------------+ - | | - | v - [Root] --> Abs--> Pow --> ReduceSum --> Pow --> Clip --> Div - """ - pow1_nodes = self.model.get_children(node, input_name_to_nodes) - if len(pow1_nodes) != 1 or pow1_nodes[0].op_type != "Pow": - return - - reduce_nodes = self.model.get_children(pow1_nodes[0], 
input_name_to_nodes) - if len(reduce_nodes) != 1 or reduce_nodes[0].op_type != "ReduceSum": - return - - pow2_nodes = self.model.get_children(reduce_nodes[0], input_name_to_nodes) - if len(pow2_nodes) != 1 or pow2_nodes[0].op_type != "Pow": - return - - clip_nodes = self.model.get_children(pow2_nodes[0], input_name_to_nodes) - if len(clip_nodes) != 1 or clip_nodes[0].op_type != "Clip": - return - - div_nodes = self.model.get_children(clip_nodes[0], input_name_to_nodes) - if len(div_nodes) != 1 or div_nodes[0].op_type != "Div": - return - - root_input = node.input[0] - if div_nodes[0].input[0] != root_input: - return - - subgraph_nodes = [node, pow1_nodes[0], reduce_nodes[0], pow2_nodes[0], clip_nodes[0], div_nodes[0]] - _, eps_val = self.model.get_constant_input(clip_nodes[0]) - _, norm_axes = self.model.get_constant_input(reduce_nodes[0]) - norm_axes = norm_axes.astype(np.int32) - - self.nodes_to_remove.extend(subgraph_nodes) - l2_normalization_node = helper.make_node( - "L2Normalization", - inputs=[node.input[0]], - outputs=[div_nodes[0].output[0]], - name=self.model.create_node_name( - "L2Normalization", name_prefix="L2Normalization" - ), - ) - l2_normalization_node.attribute.extend( - [helper.make_attribute("epsilon", float(eps_val)), - helper.make_attribute("axes", norm_axes), - helper.make_attribute("axes_length", int(norm_axes.size))] - ) - self.nodes_to_add.append(l2_normalization_node) - self.node_name_to_graph_name[l2_normalization_node.name] = self.this_graph_name \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_omdet_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_omdet_attention.py deleted file mode 100644 index 3451731f835ef05d8e61e0b5da2ef724be808f17..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_omdet_attention.py +++ /dev/null @@ -1,149 +0,0 @@ - -from logging import getLogger -from typing import Dict - -import math -import numpy as np -from onnx import TensorProto, helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - -class FusionLayerOmdetAttention(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomQKVToContextPluginDynamic_IxRT", "CustomFCPluginDynamic_IxRT" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - [Root] --> CustomFCPluginDynamic_IxRT--> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT - """ - children = self.model.get_children(node, input_name_to_nodes) - parent = self.model.get_parents(node, output_name_to_node) - - if len(children) != 1: - return - if len(parent) != 1: - return - - fc_first_node = None - for par in parent: - fc_first_node = self.model.find_first_parent_by_type( - par, "CustomFCPluginDynamic_IxRT", output_name_to_node, recursive=True - ) - if fc_first_node is not None: - break - if fc_first_node is None: - return - - start_node = node - - # v path - v_nodes = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 1, 0, 0], - output_name_to_node, - ) - - # path1, q and k path - q_nodes = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Softmax", "Add", "MatMul", "Transpose", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 0, 0, 0, 1, 0, 0, 0], - output_name_to_node, - ) 
- - k_nodes = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Softmax", "Add", "MatMul", "Mul", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - # path2, q and k path - q_nodes_1 = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Softmax", "Reshape", "Add", "Reshape", "Add", "MatMul", "Transpose", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], - output_name_to_node, - ) - - k_nodes_1 = self.model.match_parent_path( - start_node, - ["Reshape", "Transpose", "MatMul", "Softmax", "Reshape", "Add", "Reshape", "Add", "MatMul", "Mul", "Gather", "Transpose", "Reshape"], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - if v_nodes is None: - return - - if v_nodes and q_nodes and k_nodes: - subgraph_nodes = [] - subgraph_nodes.extend(q_nodes) - subgraph_nodes.extend(k_nodes) - subgraph_nodes.extend(v_nodes) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - - add_node = q_nodes[4] - hidden_size = start_node.attribute[0].i - _, mul_val = self.model.get_constant_input(k_nodes[6]) - num_heads = hidden_size // math.floor((1/mul_val)*(1/ mul_val)) - attention_input_1_name = add_node.input[1] - - if v_nodes and q_nodes_1 and k_nodes_1: - subgraph_nodes = [] - subgraph_nodes.extend(q_nodes_1) - subgraph_nodes.extend(k_nodes_1) - subgraph_nodes.extend(v_nodes) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - - hidden_size = start_node.attribute[0].i - _, mul_val = self.model.get_constant_input(k_nodes_1[9]) - num_heads = hidden_size // math.floor((1/mul_val)*(1/ mul_val)) - - add_1 = self.model.get_initializer(q_nodes_1[5].input[1], True) - add_2 = self.model.get_initializer(q_nodes_1[7].input[1], True) - add_all = np.squeeze(add_1 + add_2) - - attention_input_1_name = "attention_" + q_nodes_1[5].input[1] - attention_input_1 = helper.make_tensor( - attention_input_1_name, TensorProto.FLOAT, add_all.shape, add_all.flatten().tolist()) - - self.model.add_initializer(attention_input_1, self.this_graph_name) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=[fc_first_node.output[0], attention_input_1_name], - outputs=[start_node.input[0]], - name=self.model.create_node_name( - "OmdetAttention", name_prefix="OmdetAttention" - ), - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend([helper.make_attribute("hidden_size", hidden_size)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) - - self.nodes_to_remove.extend(subgraph_nodes_unique) - - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - - \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py deleted file mode 100644 index bb9a1cab034aaf714b416ea971ac9e6d69884894..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -from logging import getLogger - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSerialBiasAdd(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "Add", "Softmax") - - def match_parent_path_from_dict(self, start_node, path_dict): - res_path = None - res_nodes = None - for k, v in path_dict.items(): - res_nodes = self.model.match_parent_path(start_node, v[0], v[1]) - if res_nodes is None: - continue - return res_nodes, k - return res_nodes, res_path - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - paths = { - "path1": (["Reshape", "Add", "Reshape", "Add"], [0, 0, 0, 0]), - } - series_nodes, path_chosen = self.match_parent_path_from_dict(node, paths) - if not series_nodes: - return - last_reshape, add_2nd, _, add_1st = series_nodes - - biases = [ - self.model.get_initializer(add_1st.input[1]), - self.model.get_initializer(add_2nd.input[1]), - ] - if not all(biases): - return - - bias_arr_1st = NumpyHelper.to_array(biases[0]) - bias_arr_2nd = NumpyHelper.to_array(biases[1]).squeeze(0) - try: - relative_position_bias = bias_arr_1st + bias_arr_2nd - except Exception as e: - print("Two bias are unrelated:", e) - return - - # Fuse - add_name = self.model.create_node_name("Add", "Add") - B = biases[0] - B.CopyFrom(numpy_helper.from_array(relative_position_bias, B.name)) - - fused_node = helper.make_node( - "Add", - inputs=[add_1st.input[0], B.name], - outputs=last_reshape.output, - name=add_name, - ) - fused_node.domain = "com.iluvatar" - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - self.nodes_to_remove.extend(series_nodes) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_PVT_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_PVT_attention.py deleted file mode 100644 index 2d4cc73a9dcb1c8d31d778b380bd0e8a13f454e9..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_PVT_attention.py +++ /dev/null @@ -1,130 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionPVTAttention(Fusion): - """ - Fuse FusionPVTAttention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQkvCrossToContext_IxRT", - ["Softmax"], - ) - - # Flags to show warning only once - self.num_heads_warning = False - self.hidden_size_warning = False - - - def create_decoder_attention_node( - self, inputs: str, outputs: str, type_mask: int, has_mask: int,scale: float - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - - attention_node_name = self.model.create_node_name("cross_Attention") - attention_node = helper.make_node( - "CustomQkvCrossToContext_IxRT", - inputs=inputs, - outputs=outputs, - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("scale", scale)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("type_mask", type_mask)]) - - return attention_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - """ - path: - - (query) ---------------->MatMul ---->Mul --->softmax --->MatMul---> - / / - (key) ---->Transpose --> / - / - / - / - (value)---------------------------------------------> - - """ - - start_node = node - qkv_paths = { - "path": (["Mul", "MatMul", "Transpose"], [0, 0, 0]), # cross attention qery pass - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - next_nodes = self.model.get_children(node) - if len(next_nodes) == 0: - return - - if next_nodes[0].op_type != "MatMul": - return - - second_matmul_node = next_nodes[0] - attention_outputs = second_matmul_node.output - remove_nodes = [second_matmul_node, node] - - - - (mul_node, first_matmul_node, transpose_node) = qkv_nodes - transpose_nodes = self.model.get_parents(first_matmul_node) - - q_input = transpose_nodes[0].output[0] - k_input = transpose_nodes[1].input[0] - v_input = second_matmul_node.input[1] - attention_inputs = [q_input, k_input, v_input] - remove_nodes.extend([first_matmul_node, mul_node, transpose_nodes[1]]) - - has_mask = 0 - type_mask = 4 - - scale = numpy_helper.to_array(self.model.get_initializer(mul_node.input[1])).item() - atten_node = self.create_decoder_attention_node( - attention_inputs, attention_outputs, type_mask, has_mask,scale - ) - self.nodes_to_add.append(atten_node) - 
self.node_name_to_graph_name[atten_node.name] = self.this_graph_name - self.nodes_to_remove.extend(remove_nodes) \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py deleted file mode 100644 index a3e31fe7dd164b86cf9e6f4e476bc0b31246e747..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py +++ /dev/null @@ -1,643 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import List, Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_attention import AttentionMask -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -def get_tensor_attr(attrs, attr_name): - result = None - for i in attrs: - if i.name == attr_name: - return numpy_helper.to_array(i.t) - return result - - -class FusionAlbertAttention(Fusion): - """ - Fuse Albert subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - hidden_size: int, - num_heads: int, - attention_mask: AttentionMask, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - ) - self.hidden_size = hidden_size - self.num_heads = num_heads - self.attention_mask = attention_mask - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. 
- - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape_value = self.model.get_constant_value(reshape_q.input[1]) - if q_shape_value is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return self.num_heads, self.hidden_size # Fall back to user specified value - - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return self.num_heads, self.hidden_size # Fall back to user specified value - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - if self.num_heads > 0 and num_heads != self.num_heads: - if self.num_heads_warning: - logger.warning( - f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value." - ) - self.num_heads_warning = False # Do not show the warning more than once - - if self.hidden_size > 0 and hidden_size != self.hidden_size: - if self.hidden_size_warning: - logger.warning( - f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." - ) - self.hidden_size_warning = ( - False # Do not show the warning more than once - ) - - return num_heads, hidden_size - - def get_add_qk_str(self, add_qk: NodeProto): - shape_infer = self.model.infer_runtime_shape(update=True) - if shape_infer is None: - return - - input_0_shape = shape_infer.get_edge_shape(add_qk.input[0]) - input_1_shape = shape_infer.get_edge_shape(add_qk.input[1]) - - if input_0_shape is None or input_1_shape is None: - logger.debug(f"one of the inputs of {add_qk} is None") - return None - - if input_0_shape != input_1_shape: - logger.debug(f"the shape of two inputs of {add_qk} is not same") - return None - - return add_qk.input[1] - - def create_attention_node( - self, - mask_index: str, - q_matmul: NodeProto, - k_matmul: NodeProto, - v_matmul: NodeProto, - q_add: NodeProto, - k_add: NodeProto, - v_add: NodeProto, - num_heads: int, - hidden_size: int, - input: str, - output: str, - add_qk_str: str, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - mask_index (str): mask input - q_matmul (NodeProto): MatMul node in fully connection for Q - k_matmul (NodeProto): MatMul node in fully connection for K - v_matmul (NodeProto): MatMul node in fully connection for V - q_add (NodeProto): Add bias node in fully connection for Q - k_add (NodeProto): Add bias node in fully connection for K - v_add (NodeProto): Add bias node in fully connection for V - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - q_weight = self.model.get_initializer(q_matmul.input[1]) - k_weight = self.model.get_initializer(k_matmul.input[1]) - v_weight = self.model.get_initializer(v_matmul.input[1]) - q_bias = self.model.get_initializer( - q_add.input[1] - ) or self.model.get_initializer(q_add.input[0]) - k_bias = self.model.get_initializer( - k_add.input[1] - ) or self.model.get_initializer(k_add.input[0]) - v_bias = self.model.get_initializer( - v_add.input[1] - ) or self.model.get_initializer(v_add.input[0]) - - if q_weight is None: - print( - f"{q_matmul.input[1]} is not an initializer. " - "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion" - ) - return None - if not (k_weight and v_weight and q_bias and k_bias): - return None - - qw = NumpyHelper.to_array(q_weight) - kw = NumpyHelper.to_array(k_weight) - vw = NumpyHelper.to_array(v_weight) - - # assert q and k have same shape as expected - assert qw.shape == kw.shape - - qw_in_size = qw.shape[0] - kw_in_size = kw.shape[0] - vw_in_size = vw.shape[0] - - assert qw_in_size == kw_in_size == vw_in_size - - if hidden_size > 0 and hidden_size != qw_in_size: - logger.warning( - f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). " - "Please provide a correct input hidden size or pass in 0" - ) - - is_qkv_diff_dims = False - - # All the matrices can have the same shape or q, k matrics can have the same shape with v being different - # For 2d weights, the shapes would be [in_size, out_size]. - # For 3d weights, shape would be [in_size, a, b] where a*b = out_size - qw_out_size = np.prod(qw.shape[1:]) - kw_out_size = np.prod(kw.shape[1:]) - vw_out_size = np.prod(vw.shape[1:]) - - qkv_weight_dim = 0 - qkv_weight = np.concatenate((qw, kw, vw), axis=1) - qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size - - qb = NumpyHelper.to_array(q_bias) - kb = NumpyHelper.to_array(k_bias) - vb = NumpyHelper.to_array(v_bias) - - q_bias_shape = np.prod(qb.shape) - k_bias_shape = np.prod(kb.shape) - v_bias_shape = np.prod(vb.shape) - - assert q_bias_shape == k_bias_shape == qw_out_size - assert v_bias_shape == vw_out_size - - qkv_bias_dim = 0 - if is_qkv_diff_dims: - qkv_bias = np.concatenate((qb, kb, vb), axis=0) - qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape - else: - qkv_bias = np.stack((qb, kb, vb), axis=0) - qkv_bias_dim = 3 * q_bias_shape - - attention_node_name = self.model.create_node_name("Attention") - - weight = helper.make_tensor( - name=attention_node_name + "_qkv_weight", - data_type=TensorProto.FLOAT, - dims=[qkv_weight_dim, qw_in_size], - vals=qkv_weight.transpose(1, 0).flatten().tolist(), - ) - - # Sometimes weights and bias are stored in fp16 - if q_weight.data_type == 10: - weight.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(weight).astype(np.float16), weight.name - ) - ) - self.model.add_initializer(weight, self.this_graph_name) - - bias = helper.make_tensor( - name=attention_node_name + "_qkv_bias", - data_type=TensorProto.FLOAT, - dims=[qkv_bias_dim], - vals=qkv_bias.flatten().tolist(), - ) - if q_bias.data_type == 10: - bias.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(bias).astype(np.float16), bias.name - ) - ) - self.model.add_initializer(bias, self.this_graph_name) - - fc_output_tensor = helper.make_tensor_value_info( - 
attention_node_name + "_input", TensorProto.FLOAT, [None, None, None] - ) - fc_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[input], - outputs=[fc_output_tensor.name], - name=self.model.create_node_name("AttentionFC", "MatMul_AddBias_"), - ) - fc_node.domain = "com.iluvatar" - b = NumpyHelper.to_array(bias) - fc_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) - fc_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fc_node.attribute.extend([helper.make_attribute("W", weight)]) - fc_node.attribute.extend([helper.make_attribute("B", bias)]) - fc_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fc_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fc_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fc_node.name] = self.this_graph_name - self.nodes_to_add.append(fc_node) - - attention_inputs = [fc_node.output[0]] - if mask_index is not None: - attention_inputs.append(mask_index) - else: - attention_inputs.append("") - - if add_qk_str is not None: - attention_inputs.append("") - attention_inputs.append(add_qk_str) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) - - if is_qkv_diff_dims: - attention_node.attribute.extend( - [ - helper.make_attribute( - "qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size] - ) - ] - ) - - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - else: - return - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
- qkv_nodes = self.model.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [None, None, 0, 0, 0], - ) - if qkv_nodes is None: - qkv_nodes = self.model.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [1, None, 0, 0, 0], - ) - einsum_node = None - if qkv_nodes is not None: - (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes - else: - # Match Albert - qkv_nodes = self.model.match_parent_path( - start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0] - ) - if qkv_nodes is not None: - (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes - else: - return - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match flaubert Mask - | - Mul --> LayerNormalization --> Attention --> MatMul --> Add - | | - | | - +--------------------------------------------------------- - """ - mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0) - if mul_before_layernorm is not None: - mul_children = input_name_to_nodes[mul_before_layernorm.output[0]] - if mul_children is not None and len(mul_children) == 2: - layernorm_node = mul_children[1] - if layernorm_node.op_type == "LayerNormalization": - root_input = layernorm_node.output[0] - else: - return - elif mul_children is not None and len(mul_children) == 5: - root_input = mul_before_layernorm.output[0] - else: - return - elif normalize_node.op_type == "LayerNormalization": - children = input_name_to_nodes[root_input] - for child in children: - if child.op_type == "LayerNormalization": - root_input = child.output[0] - - children = input_name_to_nodes[root_input] - children_types = [child.op_type for child in children] - if children_types.count("MatMul") != 3: - return - - v_nodes = self.model.match_parent_path( - matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] - ) - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - (_, _, add_v, matmul_v) = v_nodes - - is_distill = False - is_distill_add = False - is_mul_split = False - qk_paths = { - "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]), - "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]), - "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]), - "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]), - "path5": (["Softmax", "Add", "MatMul"], [0, 0, None]) - } - - qk_nodes = None - for k, v in qk_paths.items(): - qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1]) - if qk_nodes is None: - continue - if k == "path3": - is_distill = True - if k == "path4": - is_distill_add = True - if k == "path5": - is_mul_split = True - break - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - add_qk = None - matmul_qk = None - where_qk = None - if is_distill: - (_, where_qk, matmul_qk, _) = qk_nodes - elif is_distill_add: - (_, add_qk, where_qk, matmul_qk) = qk_nodes - elif is_mul_split: - (_, add_qk, matmul_qk) = qk_nodes - else: - (_, add_qk, _, matmul_qk) = qk_nodes - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None] - ) - if q_nodes is None: - q_nodes = self.model.match_parent_path( - matmul_qk, - ["Div", "Transpose", "Reshape", "Add", "MatMul"], - [0, 0, 0, 0, None], - 
) - if q_nodes is None and is_mul_split: - q_nodes = self.model.match_parent_path( - matmul_qk, - ["Mul", "Transpose", "Reshape", "Add", "MatMul"], - [0, 0, 0, 0, None], - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - reshape_q = q_nodes[-3] - add_q = q_nodes[-2] - matmul_q = q_nodes[-1] - - k_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] - ) - if k_nodes is None: - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Transpose", "Transpose", "Reshape", "Add", "MatMul"], - [1, 0, 0, 0, None], - ) - if k_nodes is None and is_mul_split: - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Mul", "Transpose", "Reshape", "Add", "MatMul"], - [1, 0, 0, 0, None], - ) - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - add_k = k_nodes[-2] - matmul_k = k_nodes[-1] - - # Note that Cast might be removed by OnnxRuntime so we match two patterns here. - mask_nodes = None - add_qk_str = None - if is_distill: - _, mask_nodes, _ = self.model.match_parent_paths( - where_qk, - [ - (["Expand", "Reshape", "Equal"], [0, 0, 0]), - (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), - (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]), - ], - output_name_to_node, - ) - elif is_distill_add: - _, mask_nodes, _ = self.model.match_parent_paths( - where_qk, - [ - (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]), - (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), - ], - output_name_to_node, - ) - if add_qk is not None: - add_qk_str = self.get_add_qk_str(add_qk) - if add_qk_str is None: - logger.debug( - f"fuse_attention: failed to verify shape inference of {add_qk}" - ) - return - elif is_mul_split: - _, mask_nodes, _ = self.model.match_parent_paths( - add_qk, - [ - (["Where", "Cast", "Sub", "Cast", "Expand", "Unsqueeze"], [None, 0, 0, 1, 0, 0]) - ], - output_name_to_node, - ) - else: - _, mask_nodes, _ = self.model.match_parent_paths( - add_qk, - [ - ( - ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], - [None, 0, 1, 0, 0], - ), - (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]), - (["Mul", "Sub", "Cast", "Unsqueeze"], [None, 0, 1, 0]), - ], - output_name_to_node, - ) - if mask_nodes is None: - logger.debug("fuse_attention: failed to match mask path") - return - - if ( - matmul_v.input[0] == root_input - and matmul_q.input[0] == root_input - and matmul_k.input[0] == root_input - ): - # mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) - if mask_nodes[0].op_type == "Mul": - mask_val = self.model.get_initializer(mask_nodes[0].input[1]) - if mask_val is not None: - mask_val_arr = NumpyHelper.to_array(mask_val) - mask_val_arr = np.where(mask_val_arr <= -100, -100, 0.0).astype( - np.float32 - ) - mask_val.CopyFrom( - numpy_helper.from_array(mask_val_arr, mask_val.name) - ) - mask_index = mask_nodes[0].output[0] - - attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv - - q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q) - # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads - # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately - new_node = self.create_attention_node( - mask_index, - matmul_q, - matmul_k, - matmul_v, - add_q, - add_k, - add_v, - q_num_heads, - q_hidden_size, - root_input, - attention_last_node.output[0], - add_qk_str, - ) - if 
new_node is None: - return - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - if einsum_node is not None: - unique_index = einsum_node.input[0] - new_edge = "edge_modified_" + unique_index - shape_tensor = helper.make_tensor( - name="shape_modified_tensor" + unique_index, - data_type=TensorProto.INT64, - dims=[4], - vals=np.int64( - [0, 0, q_num_heads, int(q_hidden_size / q_num_heads)] - ).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor, self.this_graph_name) - self.model.add_node( - helper.make_node( - "Reshape", - [attention_last_node.output[0], shape_tensor.name], - [new_edge], - "reshape_modified_" + unique_index, - ), - self.this_graph_name, - ) - einsum_node.input[0] = new_edge - - self.nodes_to_remove.extend( - [attention_last_node, transpose_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes) - - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py deleted file mode 100644 index 38ddf62986b46b350cdf158eeccfcf1e3602fe0c..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py +++ /dev/null @@ -1,634 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class AttentionMask: - """ - Fuse Attention subgraph into one Attention node. 
- """ - - def __init__(self, model: OnnxModel): - self.model = model - # A lookup table with mask input as key, and mask index output as value - self.mask_indice = {} - # A lookup table with mask input as key, and cast (to int32) output as value - self.mask_casted = {} - self.utils = FusionUtils(model) - self.mask_format = AttentionMaskFormat.MaskIndexEnd - - def set_mask_format(self, mask_format: AttentionMaskFormat): - self.mask_format = mask_format - - def set_mask_indice(self, mask, mask_index): - if mask in self.mask_indice: - assert mask_index == self.mask_indice[mask] - self.mask_indice[mask] = mask_index - - def get_first_mask(self): - assert len(self.mask_indice) > 0 - return next(iter(self.mask_indice)) - - def process_mask(self, input: str) -> str: - if self.mask_format == AttentionMaskFormat.NoMask: - return None - - if input in self.mask_indice: - return self.mask_indice[input] - - # Add cast to convert int64 to int32 - if self.model.find_graph_input(input): - casted, input_name = self.utils.cast_graph_input_to_int32(input) - else: - input_name, cast_node = self.utils.cast_input_to_int32(input) - casted = True - - if casted: - self.mask_casted[input] = input_name - - # Attention supports int32 attention mask (2D) since 1.4.0 - if self.mask_format == AttentionMaskFormat.AttentionMask: - self.mask_indice[input] = input_name - return input_name - - # Add a mask processing node to convert attention mask to mask index (1D) - output_name = self.model.create_node_name("mask_index") - mask_index_node = helper.make_node( - "ReduceSum", - inputs=[input_name], - outputs=[output_name], - name=self.model.create_node_name("ReduceSum", "MaskReduceSum"), - ) - mask_index_node.attribute.extend( - [helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)] - ) - self.model.add_node(mask_index_node) - - self.mask_indice[input] = output_name - return output_name - - -class FusionAttention(Fusion): - """ - Fuse Attention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - hidden_size: int, - num_heads: int, - attention_mask: AttentionMask, - ): - super().__init__( - model, "Attention", ["SkipLayerNormalization", "LayerNormalization"] - ) - self.hidden_size = hidden_size - self.num_heads = num_heads - self.attention_mask = attention_mask - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. - - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return self.num_heads, self.hidden_size # Fall back to user specified value - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." 
- ) - return self.num_heads, self.hidden_size # Fall back to user specified value - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - if self.num_heads > 0 and num_heads != self.num_heads: - if self.num_heads_warning: - logger.warning( - f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value." - ) - self.num_heads_warning = False # Do not show the warning more than once - - if self.hidden_size > 0 and hidden_size != self.hidden_size: - if self.hidden_size_warning: - logger.warning( - f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." - ) - self.hidden_size_warning = ( - False # Do not show the warning more than once - ) - - return num_heads, hidden_size - - def get_add_qk_str(self, add_qk: NodeProto): - shape_infer = self.model.infer_runtime_shape(update=True) - if shape_infer is None: - return - - input_0_shape = shape_infer.get_edge_shape(add_qk.input[0]) - input_1_shape = shape_infer.get_edge_shape(add_qk.input[1]) - - if input_0_shape is None or input_1_shape is None: - logger.debug(f"one of the inputs of {add_qk} is None") - return None - - if input_0_shape != input_1_shape: - logger.debug(f"the shape of two inputs of {add_qk} is not same") - return None - - return add_qk.input[1] - - def create_attention_node( - self, - mask_index: str, - q_matmul: NodeProto, - k_matmul: NodeProto, - v_matmul: NodeProto, - q_add: NodeProto, - k_add: NodeProto, - v_add: NodeProto, - num_heads: int, - hidden_size: int, - input: str, - output: str, - add_qk_str: str, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - mask_index (str): mask input - q_matmul (NodeProto): MatMul node in fully connection for Q - k_matmul (NodeProto): MatMul node in fully connection for K - v_matmul (NodeProto): MatMul node in fully connection for V - q_add (NodeProto): Add bias node in fully connection for Q - k_add (NodeProto): Add bias node in fully connection for K - v_add (NodeProto): Add bias node in fully connection for V - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - q_weight = self.model.get_initializer(q_matmul.input[1]) - k_weight = self.model.get_initializer(k_matmul.input[1]) - v_weight = self.model.get_initializer(v_matmul.input[1]) - q_bias = self.model.get_initializer( - q_add.input[1] - ) or self.model.get_initializer(q_add.input[0]) - k_bias = self.model.get_initializer( - k_add.input[1] - ) or self.model.get_initializer(k_add.input[0]) - v_bias = self.model.get_initializer( - v_add.input[1] - ) or self.model.get_initializer(v_add.input[0]) - - if q_weight is None: - print( - f"{q_matmul.input[1]} is not an initializer. 
" - "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion" - ) - return None - if not (k_weight and v_weight and q_bias and k_bias): - return None - - qw = NumpyHelper.to_array(q_weight) - kw = NumpyHelper.to_array(k_weight) - vw = NumpyHelper.to_array(v_weight) - - # assert q and k have same shape as expected - assert qw.shape == kw.shape - - qw_in_size = qw.shape[0] - kw_in_size = kw.shape[0] - vw_in_size = vw.shape[0] - - assert qw_in_size == kw_in_size == vw_in_size - - if hidden_size > 0 and hidden_size != qw_in_size: - logger.warning( - f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). " - "Please provide a correct input hidden size or pass in 0" - ) - - is_qkv_diff_dims = False - if qw.shape != vw.shape: - is_qkv_diff_dims = True - - # All the matrices can have the same shape or q, k matrics can have the same shape with v being different - # For 2d weights, the shapes would be [in_size, out_size]. - # For 3d weights, shape would be [in_size, a, b] where a*b = out_size - qw_out_size = np.prod(qw.shape[1:]) - kw_out_size = np.prod(kw.shape[1:]) - vw_out_size = np.prod(vw.shape[1:]) - - qkv_weight_dim = 0 - if is_qkv_diff_dims: - qkv_weight = np.concatenate((qw, kw, vw), axis=1) - qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size - else: - qkv_weight = np.stack((qw, kw, vw), axis=1) - qkv_weight_dim = 3 * qw_out_size - - qb = NumpyHelper.to_array(q_bias) - kb = NumpyHelper.to_array(k_bias) - vb = NumpyHelper.to_array(v_bias) - - q_bias_shape = np.prod(qb.shape) - k_bias_shape = np.prod(kb.shape) - v_bias_shape = np.prod(vb.shape) - - assert q_bias_shape == k_bias_shape == qw_out_size - assert v_bias_shape == vw_out_size - - qkv_bias_dim = 0 - if is_qkv_diff_dims: - qkv_bias = np.concatenate((qb, kb, vb), axis=0) - qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape - else: - qkv_bias = np.stack((qb, kb, vb), axis=0) - qkv_bias_dim = 3 * q_bias_shape - - attention_node_name = self.model.create_node_name("Attention") - - weight = helper.make_tensor( - name=attention_node_name + "_qkv_weight", - data_type=TensorProto.FLOAT, - dims=[qw_in_size, qkv_weight_dim], - vals=qkv_weight.flatten().tolist(), - ) - - # Sometimes weights and bias are stored in fp16 - if q_weight.data_type == 10: - weight.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(weight).astype(np.float16), weight.name - ) - ) - self.model.add_initializer(weight, self.this_graph_name) - - bias = helper.make_tensor( - name=attention_node_name + "_qkv_bias", - data_type=TensorProto.FLOAT, - dims=[qkv_bias_dim], - vals=qkv_bias.flatten().tolist(), - ) - if q_bias.data_type == 10: - bias.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(bias).astype(np.float16), bias.name - ) - ) - self.model.add_initializer(bias, self.this_graph_name) - - attention_inputs = [ - input, - attention_node_name + "_qkv_weight", - attention_node_name + "_qkv_bias", - ] - if mask_index is not None: - attention_inputs.append(mask_index) - else: - attention_inputs.append("") - - if add_qk_str is not None: - attention_inputs.append("") - attention_inputs.append(add_qk_str) - - attention_node = helper.make_node( - "Attention", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - - if is_qkv_diff_dims: - attention_node.attribute.extend( - [ - helper.make_attribute( - "qkv_hidden_sizes", 
[qw_out_size, kw_out_size, vw_out_size] - ) - ] - ) - - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - else: - return - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. - qkv_nodes = self.model.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [None, None, 0, 0, 0], - ) - einsum_node = None - if qkv_nodes is not None: - (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes - else: - # Match Albert - qkv_nodes = self.model.match_parent_path( - start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0] - ) - if qkv_nodes is not None: - (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes - else: - return - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match flaubert Mask - | - Mul --> LayerNormalization --> Attention --> MatMul --> Add - | | - | | - +--------------------------------------------------------- - """ - mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0) - if mul_before_layernorm is not None: - mul_children = input_name_to_nodes[mul_before_layernorm.output[0]] - if mul_children is not None and len(mul_children) == 2: - layernorm_node = mul_children[1] - if layernorm_node.op_type == "LayerNormalization": - root_input = layernorm_node.output[0] - else: - return - elif mul_children is not None and len(mul_children) == 5: - root_input = mul_before_layernorm.output[0] - else: - return - elif normalize_node.op_type == "LayerNormalization": - children = input_name_to_nodes[root_input] - for child in children: - if child.op_type == "LayerNormalization": - root_input = child.output[0] - - children = input_name_to_nodes[root_input] - children_types = [child.op_type for child in children] - if children_types.count("MatMul") != 3: - return - - v_nodes = self.model.match_parent_path( - matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] - ) - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - (_, _, add_v, matmul_v) = v_nodes - - is_distill = False - is_distill_add = False - qk_paths = { - "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]), - "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]), - "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]), - "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]), - } - - qk_nodes = None - for k, v in qk_paths.items(): - qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1]) - if qk_nodes is None: - continue - if k == "path3": - is_distill = True - if k == "path4": - is_distill_add = True - break - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - - add_qk = None - matmul_qk = None - where_qk = None - if is_distill: - (_, where_qk, 
matmul_qk, _) = qk_nodes - elif is_distill_add: - (_, add_qk, where_qk, matmul_qk) = qk_nodes - else: - (_, add_qk, _, matmul_qk) = qk_nodes - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None] - ) - if q_nodes is None: - q_nodes = self.model.match_parent_path( - matmul_qk, - ["Div", "Transpose", "Reshape", "Add", "MatMul"], - [0, 0, 0, 0, None], - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - reshape_q = q_nodes[-3] - add_q = q_nodes[-2] - matmul_q = q_nodes[-1] - - k_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] - ) - if k_nodes is None: - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Transpose", "Transpose", "Reshape", "Add", "MatMul"], - [1, 0, 0, 0, None], - ) - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - add_k = k_nodes[-2] - matmul_k = k_nodes[-1] - - # Note that Cast might be removed by OnnxRuntime so we match two patterns here. - mask_nodes = None - add_qk_str = None - if is_distill: - _, mask_nodes, _ = self.model.match_parent_paths( - where_qk, - [ - (["Expand", "Reshape", "Equal"], [0, 0, 0]), - (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), - (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]), - ], - output_name_to_node, - ) - elif is_distill_add: - _, mask_nodes, _ = self.model.match_parent_paths( - where_qk, - [ - (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]), - (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), - ], - output_name_to_node, - ) - if add_qk is not None: - add_qk_str = self.get_add_qk_str(add_qk) - if add_qk_str is None: - logger.debug( - f"fuse_attention: failed to verify shape inference of {add_qk}" - ) - return - else: - _, mask_nodes, _ = self.model.match_parent_paths( - add_qk, - [ - ( - ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], - [None, 0, 1, 0, 0], - ), - (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]), - ], - output_name_to_node, - ) - if mask_nodes is None: - logger.debug("fuse_attention: failed to match mask path") - return - - if ( - matmul_v.input[0] == root_input - and matmul_q.input[0] == root_input - and matmul_k.input[0] == root_input - ): - mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) - - attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv - - q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q) - # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads - # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately - new_node = self.create_attention_node( - mask_index, - matmul_q, - matmul_k, - matmul_v, - add_q, - add_k, - add_v, - q_num_heads, - q_hidden_size, - root_input, - attention_last_node.output[0], - add_qk_str, - ) - if new_node is None: - return - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - if einsum_node is not None: - unique_index = einsum_node.input[0] - new_edge = "edge_modified_" + unique_index - shape_tensor = helper.make_tensor( - name="shape_modified_tensor" + unique_index, - data_type=TensorProto.INT64, - dims=[4], - vals=np.int64( - [0, 0, q_num_heads, int(q_hidden_size / q_num_heads)] - ).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor, self.this_graph_name) - self.model.add_node( - 
helper.make_node( - "Reshape", - [attention_last_node.output[0], shape_tensor.name], - [new_edge], - "reshape_modified_" + unique_index, - ), - self.this_graph_name, - ) - einsum_node.input[0] = new_edge - - self.nodes_to_remove.extend( - [attention_last_node, transpose_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes) - - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py deleted file mode 100644 index 3732b0f5fab40cbb269f18abdd56286f298a5493..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import List, Union - -from onnx import GraphProto - -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class Fusion: - def __init__( - self, - model: OnnxModel, - fused_op_type: str, - search_op_types: Union[str, List[str]], - description: str = None, - ): - self.search_op_types: List[str] = ( - [search_op_types] if isinstance(search_op_types, str) else search_op_types - ) - self.fused_op_type: str = fused_op_type - self.description: str = ( - f"{fused_op_type}({description})" if description else fused_op_type - ) - self.model: OnnxModel = model - self.nodes_to_remove: List = [] - self.nodes_to_add: List = [] - self.prune_graph: bool = False - self.node_name_to_graph_name: dict = {} - self.this_graph_name: str = None - # It is optional that subclass updates fused_count since we will also check nodes_to_add to get counter. - self.fused_count: int = 0 - - def apply(self): - logger.debug(f"start {self.description} fusion...") - input_name_to_nodes = self.model.input_name_to_nodes() - output_name_to_node = self.model.output_name_to_node() - - # This assumes that two search ops will not be fused at same time! 
- for search_op_type in self.search_op_types: - for node in self.model.get_nodes_by_op_type(search_op_type): - graph = self.model.get_graph_by_node(node) - if graph is None: - raise Exception("Can not find node in any graphs") - self.this_graph_name = graph.name - self.fuse(node, input_name_to_nodes, output_name_to_node) - - op_list = [node.op_type for node in self.nodes_to_add] - count = max(self.fused_count, op_list.count(self.fused_op_type)) - if count > 0: - logger.info(f"Fused {self.description} count: {count}") - - self.model.remove_nodes(self.nodes_to_remove) - self.model.add_nodes(self.nodes_to_add, self.node_name_to_graph_name) - - if self.prune_graph: - self.model.prune_graph() - elif self.nodes_to_remove or self.nodes_to_add: - self.model.update_graph() - - def match_parent_path_from_dict( - self, start_node, path_dict, output_name_to_node=None, return_indice=None - ): - res_path = None - res_nodes = None - for k, v in path_dict.items(): - res_nodes = self.model.match_parent_path( - start_node, - v[0], - v[1], - output_name_to_node=output_name_to_node, - return_indice=return_indice, - ) - if res_nodes is None: - continue - return res_nodes, k - return res_nodes, res_path diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py deleted file mode 100644 index 045cd99380a7535079d0f9f33322e2879d2074c0..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionBiasGelu(Fusion): - def __init__(self, model: OnnxModel, is_fastgelu): - if is_fastgelu: - super().__init__(model, "FastGelu", "FastGelu", "add bias") - else: - super().__init__(model, "BiasGelu", "Gelu") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - gelu_op_type = node.op_type - fuse_op_type = "BiasGelu" if gelu_op_type == "Gelu" else "FastGelu" - - if len(node.input) != 1: - return - - nodes = self.model.match_parent_path(node, ["Add", "MatMul"], [0, None]) - if nodes is None: - return - (add, matmul) = nodes - - bias_weight = None - # bias should be one dimension - bias_index = -1 - for i, input in enumerate(add.input): - initializer = self.model.get_initializer(input) - if initializer is None: - continue - bias_index = i - bias_weight = NumpyHelper.to_array(initializer) - break - if bias_weight is None: - return - if len(bias_weight.shape) != 1: - return - - subgraph_nodes = [node, add] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - - fused_node = helper.make_node( - fuse_op_type, - inputs=[matmul.output[0], add.input[bias_index]], - outputs=node.output, - name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"), - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py deleted file mode 100644 index 21161727373b1ceee5362bc2fa0e713f17e899ae..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# --------------------------------------------------------------------------
-import math
-from enum import Enum
-from logging import getLogger
-from os import name
-from sys import path
-from typing import Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import NodeProto, TensorProto, helper, numpy_helper
-
-from .fusion_base import Fusion
-from .fusion_options import AttentionMaskFormat
-from .fusion_utils import FusionUtils, NumpyHelper
-from .onnx_model import OnnxModel
-from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
-
-logger = getLogger(__name__)
-
-
-class FusionConformerAttention(Fusion):
-    """
-    Fuse conformer attention subgraph into one Attention node.
-    """
-
-    def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
-        super().__init__(model, "CustomQKVToContextPluginDynamic_IxRT", ["Concat"])
-
-        # Flags to show warning only once
-        self.num_heads_warning = True
-        self.hidden_size_warning = True
-
-        self.hidden_size = hidden_size
-        self.num_heads = num_heads
-
-    def get_num_heads_and_hidden_size(
-        self, atten_matmul: NodeProto, div: NodeProto
-    ) -> Tuple[int, int]:
-        """Detect num_heads and hidden_size from the attention MatMul weight and the Div node.
-
-        Args:
-            atten_matmul (NodeProto): MatMul node whose weight carries the hidden size
-            div (NodeProto): Div node whose constant is sqrt(head_dim)
-
-        Returns:
-            Tuple[int, int]: num_heads and hidden_size
-        """
-
-        # we assume that reshape fusion has been done, so the shape is a tensor like [0, 0, num_heads, head_size]
-        atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1])
-        div_initializer = self.model.get_initializer(div.input[1])
-
-        # Check whether float_data is populated
-        if len(div_initializer.float_data) > 0:
-            div_value = div_initializer.float_data[0]
-        else:
-            # float_data is empty, so try to read the value another way,
-            # e.g. it may be stored in raw_data
-            if len(div_initializer.raw_data) > 0:
-                dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type]
-                div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0]
-            else:
-                raise ValueError("Data not found in the div_initializer")
-
-        atten_matul_shape_value = NumpyHelper.to_array(atten_matul_initializer).shape
-        head_dim = math.ceil(div_value * div_value)
-        hidden_size = atten_matul_shape_value[0]
-        num_heads = hidden_size // head_dim
-
-        return num_heads, hidden_size
-
-    def create_attention_node(
-        self, num_heads: int, hidden_size: int, inputs: str, outputs: str
-    ) -> Union[NodeProto, None]:
-        """Create an Attention node.
-
-        Args:
-            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
-            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
-            inputs (str): input names
-            outputs (str): output names
-
-        Returns:
-            Union[NodeProto, None]: the node created or None if failed.
- """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=inputs, - outputs=outputs, - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) - - return attention_node - - def fuse_reshape(self, shape_data_name): - - shape_tensor = helper.make_tensor( - name=shape_data_name, - data_type=TensorProto.INT64, - dims=[3], - vals=np.int64([128, -1, self.hidden_size // self.num_heads]).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor, self.this_graph_name) - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - - paths = { - "path": ( - ["Unsqueeze", "Mul", "Gather", "Shape", "LayerNormalization"], - [None, None, None, None, None], - ), - } - - reshape_nodes, reshape_path = self.match_parent_path_from_dict( - start_node, paths - ) - if reshape_nodes is None: - return - - self.nodes_to_remove.append(start_node) - - self.nodes_to_remove.extend(reshape_nodes[:-1]) - self.fuse_reshape(start_node.output[0]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py deleted file mode 100644 index b55c2412b07067d3ebb05cc080be6a3a31902e22..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -import numpy as np -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionConformerXSoftmax(Fusion): - """ - Fuse Where + Softmax + Where into one node: XSoftmax - """ - - def __init__(self, model: OnnxModel): - super().__init__(model, "XSoftmax_IxRT", "Softmax") - - def create_xsoftmax_node( - self, data_input: str, mask_input: str, output: str - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - - unique_index = data_input - new_edge = "edge_modified_" + unique_index - shape_tensor = helper.make_tensor( - name="shape_modified_tensor_" + unique_index, - data_type=TensorProto.INT64, - dims=[4], - vals=np.int64( - [-1, 8, 128, 128] # (BSZ, HEAD_NUM, SEQ_LEN, SEQ_LEN) - ).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor, self.this_graph_name) - self.model.add_node( - helper.make_node( - "Reshape", - [data_input, shape_tensor.name], - [new_edge], - "reshape_modified_" + unique_index, - ), - self.this_graph_name, - ) - - new_edge2 = "edge_modified2_" + unique_index - xsoftmax_node_name = self.model.create_node_name("XSoftmax") - - xsoftmax_node = helper.make_node( - "XSoftmax_IxRT", - inputs=[new_edge, mask_input], - outputs=[new_edge2], - name=xsoftmax_node_name, - ) - xsoftmax_node.domain = "com.iluvatar" - xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)]) - xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)]) - xsoftmax_node.attribute.extend([helper.make_attribute("is_conformer", 1)]) - - shape_tensor2 = helper.make_tensor( - name="shape_modified_tensor2_" + unique_index, - data_type=TensorProto.INT64, - dims=[3], - vals=np.int64( - [-1, 128, 128] # (BSZ, HEAD_NUM, SEQ_LEN, SEQ_LEN) - ).tobytes(), - raw=True, - ) - self.model.add_initializer(shape_tensor2, self.this_graph_name) - self.model.add_node( - helper.make_node( - "Reshape", - [new_edge2, shape_tensor2.name], - [output], - "reshape_modified2_" + unique_index, - ), - self.this_graph_name, - ) - - return xsoftmax_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - xsoftmax_paths = { - "path": (["Add", "Where", "Reshape", "Expand"], [None, None, None, None]), - } - xsoftmax_nodes, xsoftmax_path = self.match_parent_path_from_dict( - node, xsoftmax_paths - ) - - if xsoftmax_nodes is None: - logger.debug("fuse_xsoftmax: failed to match xsoftmax path") - return - else: - (add_node, where_node, reshape_node, expand_node) = xsoftmax_nodes - - mask_input = expand_node.input[0] - - data_output = node.output[0] - - data_input = add_node.input[0] - if where_node.output[0] == add_node.input[0]: - data_input = add_node.input[1] - xsoftmax_node = self.create_xsoftmax_node( - data_input, mask_input, data_output - ) - - self.nodes_to_remove.extend(xsoftmax_nodes) - self.nodes_to_add.append(xsoftmax_node) - self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name diff --git 
a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conv_reformat.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conv_reformat.py deleted file mode 100644 index 23cdd0c2d0dca61bf66eb1f484e3093f4d7bf0c6..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conv_reformat.py +++ /dev/null @@ -1,128 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionConvReformat(Fusion): - """ - Fuse FusionPVTAttention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "FuseConvReformat_IxRT", - ["Transpose"], - ) - - - - def create_fuse_node( - self, inputs: str, outputs: str, before_conv: int, shape_data: list, prefix - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - - node_name = self.model.create_node_name(f"FuseConvReformat_{prefix}") - node = helper.make_node( - "FuseConvReformat_IxRT", - inputs=inputs, - outputs=outputs, - name=node_name, - ) - node.domain = "com.iluvatar" - - node.attribute.extend([helper.make_attribute("before_conv", before_conv)]) - node.attribute.extend([helper.make_attribute("shape_data", shape_data)]) - node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - return node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - """ - eliminate Transpose(linear->nchw) + Transpose - path: - ----->Transpose ---->Reshape---> conv ----->Reshape ---->Transpose---> - - to: - ----->FuseConvReformat_IxRT---> conv ----->FuseConvReformat_IxRT---> - - """ - start_node = node - paths = { - "path": (["Reshape", "Conv", "Reshape","Transpose"], [0, 0, 0, 0]), # cross attention qery pass - } - - nodes, path = self.match_parent_path_from_dict(start_node, paths) - - if nodes is None: - logger.debug("FuseConvReformat: failed to match path") - return - - (reshape_after_node, conv_node, reshape_before_node, tranpose_before_node) = nodes - - perm1 = tranpose_before_node.attribute[0].ints - if perm1 !=[0, 2, 1]: - return - perm2 = start_node.attribute[0].ints - if perm2 !=[0, 2, 1]: - return - - before_shape_data = numpy_helper.to_array(self.model.get_initializer(reshape_before_node.input[1])) - - if before_shape_data.shape[0] != 4: - return - - after_shape_data = numpy_helper.to_array(self.model.get_initializer(reshape_after_node.input[1])) - if after_shape_data.shape[0] != 3: - return - node1_inputs = tranpose_before_node.input - node1_outputs = reshape_before_node.output - node1_before_conv = 1 - - new_node1 = self.create_fuse_node( - node1_inputs, node1_outputs, node1_before_conv, before_shape_data,"before") - - - node2_inputs = conv_node.output - node2_outputs = start_node.output - node2_before_conv = 0 - new_node2 = self.create_fuse_node( - node2_inputs, node2_outputs, node2_before_conv, after_shape_data,"after") - - self.nodes_to_add.append(new_node1) - self.nodes_to_add.append(new_node2) - self.node_name_to_graph_name[new_node1.name] = self.this_graph_name - self.node_name_to_graph_name[new_node2.name] = self.this_graph_name - self.nodes_to_remove.extend([start_node, reshape_after_node,reshape_before_node,tranpose_before_node]) - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_attention.py deleted file mode 100644 index 5bfa8768e7077fad40b9ef8ff51427db217a5069..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_attention.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - - -class FusionCosyvoiceAttention(Fusion): - """ - Fuse T5Attention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQkvCrossToContext_IxRT", - ["Softmax"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. - - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return [0, 0] - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return [0, 0] - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - return num_heads, hidden_size - - def create_decoder_attention_node( - self, inputs: str, outputs: str, type_mask: int, has_mask: int, scale: float - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - - attention_node_name = self.model.create_node_name("decoder_Attention") - attention_node = helper.make_node( - "CustomQkvCrossToContext_IxRT", - inputs=inputs, - outputs=outputs, - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("scale", scale)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("type_mask", type_mask)]) - - return attention_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - """ - path1: - - (query) --------------MatMul---Div --> add -->softmax --->MatMul---> - / / / - (key) ---->Transpose > / / - / / - (mask) ------------------------> / - / - (value)---------------------------------------------> - """ - - - - - import pdb - start_node = node - qkv_paths = { - "path1": ( - ["Add", "Div", "MatMul", "Transpose"], - [None, 0, None, 1], - ), # float mask self attention,self attention key pass - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - next_nodes = self.model.get_children(node) - - if len(next_nodes) == 0: - return - - if next_nodes[0].op_type != "MatMul": - return - - second_matmul_node = next_nodes[0] - attention_inputs = None - attention_outputs = second_matmul_node.output - remove_nodes = [second_matmul_node, node] - - (add_node, div_node, first_matmul_node, transpose_node) = qkv_nodes - transpose_nodes = self.model.get_parents(first_matmul_node) - q_input = transpose_nodes[0].output[0] - - k_transpose_node = transpose_nodes[1] - k_transpose_node_perm = k_transpose_node.attribute[0].ints - - if k_transpose_node_perm == [0, 2, 3, 1]: #transpose has bean merge,[0,2,1,3]->[0, 1, 3, 2] = [0, 2, 3, 1] - k_input = transpose_nodes[1].output[0] - - transpose_nodes[1].attribute[0].ints[0] = 0 - transpose_nodes[1].attribute[0].ints[1] = 2 - transpose_nodes[1].attribute[0].ints[2] = 1 - transpose_nodes[1].attribute[0].ints[3] = 3 - - remove_nodes.extend([add_node, div_node, first_matmul_node]) - - elif k_transpose_node_perm == [0, 1, 3, 2]: - k_input = transpose_nodes[1].input[0] - remove_nodes.extend([add_node, div_node, first_matmul_node,k_transpose_node]) - - else: - return - - v_input = second_matmul_node.input[1] - attention_inputs = [q_input, k_input, v_input] - - has_mask = 1 - type_mask = 3 # float mask - - mask_input = add_node.input[0] - score_out = div_node.output[0] - if add_node.input[0] == score_out: - mask_input = add_node.input[1] - attention_inputs.append(mask_input) - - scale_data = self.model.get_initializer_input_edges(div_node.name, return_np_array = True) - scale = 1.0 / scale_data[0] - - atten_node = self.create_decoder_attention_node( - attention_inputs, attention_outputs, type_mask, has_mask, scale - ) - - self.nodes_to_add.append(atten_node) - self.node_name_to_graph_name[atten_node.name] = self.this_graph_name - self.nodes_to_remove.extend(remove_nodes) - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV.py 
deleted file mode 100755 index d1a1baffd56aba589caa4251d7d841e9715b8f02..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSplitQKV(Fusion): - """ - Fuse FusionSplitQKV - """ - - def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): - super().__init__(model, "SplitQKV_IxRT", "Split") - - self.hidden_size = hidden_size - self.num_heads = num_heads - - def create_node( - self, inputs: list, outputs:list - ) -> Union[NodeProto, None]: - """Create an create node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - node_name = self.model.create_node_name("SplitQKV_IxRT") - - - k_cache_output = outputs[1] - v_cache_output = outputs[2] - - concat_k_input = k_cache_output + "_k_concat_input" - concat_v_input = v_cache_output + "_v_concat_input" - - plugin_outputs = [outputs[0],concat_k_input,concat_v_input] - - new_node = helper.make_node( - "SplitQKV_IxRT", - inputs=inputs, - outputs=plugin_outputs, - name=node_name, - ) - new_node.domain = "com.iluvatar" - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - new_node.attribute.extend( - [helper.make_attribute("atten_scale", 1.0)] - ) - new_node.attribute.extend( - [helper.make_attribute("transpose", 1)] - ) - new_node.attribute.extend([helper.make_attribute("num_head", self.num_heads)]) - new_node.attribute.extend( - [helper.make_attribute("head_dim", self.hidden_size // self.num_heads)] - ) - - - - k_concat_node_name = node_name + "_k_concat" - v_concat_node_name = node_name + "_v_concat" - - k_concat_node = helper.make_node( - "Identity", - inputs=[concat_k_input], - outputs=[outputs[1]], - name=k_concat_node_name, - ) - - v_concat_node = helper.make_node( - "Identity", - inputs=[concat_v_input], - outputs=[outputs[2]], - name=v_concat_node_name, - ) - - self.model.replace_input_of_all_nodes(outputs[1],concat_k_input) - self.model.replace_input_of_all_nodes(outputs[2],concat_v_input) - return new_node,k_concat_node,v_concat_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - split_node = node - split_data = self.model.get_initializer_input_edges(node.name,return_np_array = True) - if split_data[0].shape != (3,): - return - if split_data[0][0] != split_data[0][1] and split_data[0][1] != split_data[0][2]: - return - - q_input, k_input, v_input = node.output[0],node.output[1],node.output[2] - - q_path_nodes= [] - k_path_nodes= [] - v_path_nodes= [] - - reshape_nodes = self.model.get_children(node) - - for node in reshape_nodes: - if node.op_type != "Reshape": - return - q_reshape_node,k_reshape_node,v_reshape_node = reshape_nodes[0],reshape_nodes[1],reshape_nodes[2] - - q_path_nodes.append(q_reshape_node) - k_path_nodes.append(k_reshape_node) - v_path_nodes.append(v_reshape_node) - - q_transpose_nodes = self.model.get_children(q_reshape_node) - k_transpose_nodes = self.model.get_children(k_reshape_node) - v_transpose_nodes = self.model.get_children(v_reshape_node) - - if len(q_transpose_nodes)!=1 and (not k_transpose_nodes) and len(v_transpose_nodes) != 1: - return - - - if (q_transpose_nodes[0].attribute[0].ints != [0, 2, 1, 3]) and (v_transpose_nodes[0].attribute[0].ints !=[0, 2, 1, 3]): - return - - if len(k_transpose_nodes) == 2: - if (k_transpose_nodes[0].attribute[0].ints != k_transpose_nodes[1].attribute[0].ints) and (k_transpose_nodes[0].attribute[0].ints !=[0, 2, 1, 3]): - return - - - if len(k_transpose_nodes) == 1: - if (k_transpose_nodes[0].attribute[0].ints !=[0, 2, 1, 3]): - return - - - q_transpose_node = q_transpose_nodes[0] - k_transpose_node_0 = k_transpose_nodes[0] - v_transpose_node = v_transpose_nodes[0] - - k_output = k_transpose_node_0.output[0] - - if len(k_transpose_nodes) == 2: - k_transpose_node_1 = k_transpose_nodes[1] - next_node = self.model.get_children(k_transpose_node_1) - if not next_node: - return - - self.model.replace_node_input(next_node[0], k_transpose_node_1.output[0], k_transpose_node_0.output[0]) - - - q_path_nodes.append(q_transpose_node) - v_path_nodes.append(v_transpose_node) - 
k_path_nodes.extend(k_transpose_nodes) - - plugin_inputs = [split_node.input[0]] - plugin_outputs = [q_transpose_node.output[0], k_output,v_transpose_node.output[0]] - - remove_nodes = [split_node] - - remove_nodes.extend(q_path_nodes) - remove_nodes.extend(k_path_nodes) - remove_nodes.extend(v_path_nodes) - - new_node,k_cache_concat_node, v_cache_concat_node = self.create_node(plugin_inputs, plugin_outputs) - - self.nodes_to_add.append(new_node) - self.nodes_to_add.append(k_cache_concat_node) - self.nodes_to_add.append(v_cache_concat_node) - - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.node_name_to_graph_name[k_cache_concat_node.name] = self.this_graph_name - self.node_name_to_graph_name[v_cache_concat_node.name] = self.this_graph_name - self.nodes_to_remove.extend(remove_nodes) - - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV_update_KVcache.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV_update_KVcache.py deleted file mode 100644 index 6b1599d4b27cf32c74dc9c294564490ff1e799da..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_cosyvoice_splitQKV_update_KVcache.py +++ /dev/null @@ -1,188 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionCosyVoiceSplitQKVUpdateKVCache(Fusion): - """ - Fuse FusionSplitQKVUpdateKVCache - """ - - def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): - super().__init__( - model, "SplitQKVUpdateKVCache_IxRT", "Split" - ) - - self.hidden_size = hidden_size - self.num_heads = num_heads - - def create_node( - self, - inputs: list, - outputs: list, - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - node_name = self.model.create_node_name("SplitQKVUpdateKVCache_IxRT") - - k_cache_output = outputs[1] - v_cache_output = outputs[2] - - concat_k_input = k_cache_output + "_k_concat_input" - concat_v_input = v_cache_output + "_v_concat_input" - - plugin_outputs = [outputs[0],concat_k_input,concat_v_input] - - new_node = helper.make_node( - "SplitQKVUpdateKVCache_IxRT", - inputs=inputs, - outputs=plugin_outputs, - name=node_name, - ) - - k_concat_node_name = node_name + "_k_concat" - v_concat_node_name = node_name + "_v_concat" - - k_concat_node = helper.make_node( - "Identity", - inputs=[concat_k_input], - outputs=[outputs[1]], - name=k_concat_node_name, - ) - - - - v_concat_node = helper.make_node( - "Identity", - inputs=[concat_v_input], - outputs=[outputs[2]], - name=v_concat_node_name, - ) - - - - - - - new_node.domain = "com.iluvatar" - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - new_node.attribute.extend([helper.make_attribute("num_head", self.num_heads)]) - new_node.attribute.extend( - [helper.make_attribute("head_dim", self.hidden_size // self.num_heads)] - ) - - self.model.replace_input_of_all_nodes(outputs[1],concat_k_input) - self.model.replace_input_of_all_nodes(outputs[2],concat_v_input) - - return new_node,k_concat_node,v_concat_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - split_node = node - split_data = self.model.get_initializer_input_edges(node.name,return_np_array = True) - if split_data[0].shape != (3,): - return - if split_data[0][0] != split_data[0][1] and split_data[0][1] != split_data[0][2]: - return - - q_input, k_input, v_input = node.output[0],node.output[1],node.output[2] - - q_path_nodes= [] - k_path_nodes= [] - v_path_nodes= [] - - reshape_nodes = self.model.get_children(node) - - for node in reshape_nodes: - if node.op_type != "Reshape": - return - q_reshape_node,k_reshape_node,v_reshape_node = reshape_nodes[0],reshape_nodes[1],reshape_nodes[2] - - q_path_nodes.append(q_reshape_node) - k_path_nodes.append(k_reshape_node) - v_path_nodes.append(v_reshape_node) - - q_transpose_nodes = self.model.get_children(q_reshape_node) - k_transpose_nodes = self.model.get_children(k_reshape_node) - v_transpose_nodes = self.model.get_children(v_reshape_node) - - if len(q_transpose_nodes)!=1 and len(k_transpose_nodes) != 1 and len(v_transpose_nodes) != 1: - return - - - q_transpose_node = q_transpose_nodes[0] - - k_transpose_node = k_transpose_nodes[0] - v_transpose_node = v_transpose_nodes[0] - - k_path_nodes.append(k_transpose_node) - v_path_nodes.append(v_transpose_node) - - - k_concat_nodes = self.model.get_children(k_transpose_node) - v_concat_nodes = self.model.get_children(v_transpose_node) - - if len(k_transpose_nodes) != 1 or len(v_transpose_nodes) != 1: - return - - k_concat_node = k_concat_nodes[0] - v_concat_node = v_concat_nodes[0] - - if v_concat_node.attribute[0].i != 2 and k_concat_node.attribute[0].i != 2: #axis = 2 - return - - k_path_nodes.append(k_concat_node) - v_path_nodes.append(v_concat_node) - - k_cache_input = k_concat_node.input[0] - if k_transpose_node.output[0] == k_concat_node.input[0]: - k_cache_input = k_concat_node.input[1] - k_cache_output = k_concat_node.output[0] - - - - v_cache_input = v_concat_node.input[0] - if v_transpose_node.output[0] == v_concat_node.input[0]: - v_cache_input = v_concat_node.input[1] - v_cache_output = v_concat_node.output[0] - - - plugin_inputs = 
[split_node.input[0],k_cache_input,v_cache_input] - plugin_outputs = [q_transpose_node.output[0], k_cache_output,v_cache_output] - remove_nodes = [split_node, q_reshape_node,q_transpose_node] - - remove_nodes.extend(k_path_nodes) - remove_nodes.extend(v_path_nodes) - new_node,k_cache_concat_node, v_cache_concat_node= self.create_node(plugin_inputs, plugin_outputs) - - self.nodes_to_add.append(new_node) - self.nodes_to_add.append(k_cache_concat_node) - self.nodes_to_add.append(v_cache_concat_node) - - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.node_name_to_graph_name[k_cache_concat_node.name] = self.this_graph_name - self.node_name_to_graph_name[v_cache_concat_node.name] = self.this_graph_name - - self.nodes_to_remove.extend(remove_nodes) - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py deleted file mode 100644 index c2dd243357fac20057d67551c0d3d9d86b15dc68..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py +++ /dev/null @@ -1,389 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionCustomFCGPT2(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Reshape"], "gpt2") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - nodes = self.model.match_parent_path(node, ["Gemm", "Reshape"], [0, 0]) - - if nodes is None: - return False - - (matmul, reshape_before_matmul) = nodes - - matmul_weight = self.model.get_initializer(matmul.input[1]) - matmul_bias = self.model.get_initializer(matmul.input[2]) - - if matmul_weight is None or matmul_bias is None: - return False - - w = NumpyHelper.to_array(matmul_weight) - b = NumpyHelper.to_array(matmul_bias) - - transB = 0 - for attr in matmul.attribute: - if attr.name == "transB": - transB = attr.i - break - - trans_matmul_weight = w - if transB == 0: - trans_matmul_weight = w.transpose(1, 0) - if matmul_weight.name not in self.model.initializer_visited.keys(): - self.model.initializer_visited[matmul_weight.name] = True - if matmul_weight.data_type == 10: - matmul_weight.CopyFrom( - numpy_helper.from_array( - trans_matmul_weight.astype(np.float16), matmul_weight.name - ) - ) - else: - matmul_weight.CopyFrom( - numpy_helper.from_array(trans_matmul_weight, matmul_weight.name) - ) - - if matmul_bias.data_type == 10: - matmul_bias.CopyFrom( - numpy_helper.from_array(b.astype(np.float16), matmul_bias.name) - ) - else: - matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name)) - - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[reshape_before_matmul.input[0]], - outputs=node.output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", matmul_weight)]) - fused_node.attribute.extend([helper.make_attribute("B", matmul_bias)]) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - self.nodes_to_remove.extend([matmul, node, reshape_before_matmul]) - - -class FusionCustomFcRoformer(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"], "roformer fc") - - # For model Roformer. 
- - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if len(node.input) != 2: - return False - - fc_paths = { - "path1": (["Reshape", "MatMul", "Reshape"], [0, 0, 0]), - "path2": (["Reshape", "MatMul", "Reshape"], [1, 0, 0]), - } - - nodes, paths = self.match_parent_path_from_dict(node, fc_paths) - if nodes is None: - return False - - reshape_after_matmul = nodes[0] - matmul = nodes[1] - reshape_before_matmul = nodes[2] - - reshape_before_shape = None - reshape_after_shape = None - for value_info in self.model.graph().value_info: - if value_info.name == reshape_before_matmul.input[0]: - reshape_before_shape = len(value_info.type.tensor_type.shape.dim) - break - for value_info in self.model.graph().value_info: - if value_info.name == reshape_after_matmul.output[0]: - reshape_after_shape = len(value_info.type.tensor_type.shape.dim) - break - if reshape_before_shape != reshape_after_shape: - return False - - weight = self.model.get_initializer(matmul.input[1]) - bias = self.model.get_initializer(node.input[1]) or self.model.get_initializer( - node.input[0] - ) - - if weight is None or bias is None: - return False - - w = NumpyHelper.to_array(weight) - w_in_size = w.shape[0] - weight_dim = np.prod(w.shape[1:]) - - b = NumpyHelper.to_array(bias) - bias_dim = np.prod(b.shape) - trans_matmul_weight = w.transpose(1, 0) - weight.CopyFrom(onnx.numpy_helper.from_array(trans_matmul_weight, weight.name)) - # Sometimes weights and bias are stored in fp16 - if weight.data_type == 10: - weight.CopyFrom( - numpy_helper.from_array( - trans_matmul_weight.astype(np.float16), weight.name - ) - ) - bias_arr = onnx.numpy_helper.to_array(bias).flatten() - bias.CopyFrom(onnx.numpy_helper.from_array(bias_arr, bias.name)) - if bias.data_type == 10: - bias.CopyFrom( - numpy_helper.from_array( - NumpyHelper.to_array(bias).astype(np.float16), bias.name - ) - ) - - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[reshape_before_matmul.input[0]], - outputs=node.output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", weight)]) - fused_node.attribute.extend([helper.make_attribute("B", bias)]) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - - self.nodes_to_remove.extend([node]) - self.nodes_to_remove.extend(nodes) - return True - - -class FusionCustomFC(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"]) - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if self.fuse_1(node, input_name_to_nodes, output_name_to_node): - return - - def fuse_1(self, node, input_name_to_nodes, output_name_to_node): - if len(node.input) != 2: - return False - nodes = self.model.match_parent_path(node, ["MatMul"], [None]) - - if nodes is None: - return False - matmul = nodes[0] - - matmul_weight = self.model.get_initializer(matmul.input[1]) - matmul_bias = self.model.get_initializer( - node.input[1] - ) or self.model.get_initializer(node.input[0]) - - if matmul_weight is None or 
matmul_bias is None: - return False - - w = NumpyHelper.to_array(matmul_weight) - b = NumpyHelper.to_array(matmul_bias) - - trans_matmul_weight = w.transpose(1, 0) - if matmul_weight.name not in self.model.initializer_visited.keys(): - self.model.initializer_visited[matmul_weight.name] = True - if matmul_weight.data_type == 10: - matmul_weight.CopyFrom( - numpy_helper.from_array( - trans_matmul_weight.astype(np.float16), matmul_weight.name - ) - ) - else: - matmul_weight.CopyFrom( - numpy_helper.from_array(trans_matmul_weight, matmul_weight.name) - ) - - if matmul_bias.data_type == 10: - matmul_bias.CopyFrom( - numpy_helper.from_array(b.astype(np.float16), matmul_bias.name) - ) - else: - matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name)) - - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[matmul.input[0]], - outputs=node.output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", matmul_weight)]) - fused_node.attribute.extend([helper.make_attribute("B", matmul_bias)]) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - self.nodes_to_remove.extend([matmul, node]) - return True - - -class FusionCustomFCActivation(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, - "CustomFCPluginDynamic_IxRT", - ["Gelu", "Relu", "CustomGeluPluginDynamic_IxRT", "Mul"], - "with activation", - ) - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if node.op_type == "Mul": - return_indice = [] - nodes = self.model.match_parent_path( - node, - ["Sigmoid", "Mul", "CustomFCPluginDynamic_IxRT"], - [None, 0, 0], - return_indice=return_indice, - ) - if nodes is None: - return - - (sigmoid_node, mul_node, custom_fc_node) = nodes - if output_name_to_node[node.input[1 - return_indice[0]]] != custom_fc_node: - return - - activation_type = 20 - for attr in custom_fc_node.attribute: - if attr.name == "act_type": - attr.i = activation_type - break - - custom_fc_node.output[0] = node.output[0] - self.nodes_to_add.append(custom_fc_node) - self.nodes_to_remove.extend([node, sigmoid_node, mul_node, custom_fc_node]) - self.node_name_to_graph_name[custom_fc_node.name] = self.this_graph_name - else: - nodes = self.model.match_parent_path( - node, ["CustomFCPluginDynamic_IxRT"], [0] - ) - - if nodes is None: - logger.debug("CustomFCActivation: failed to match fc+gelu/relu path") - return - - fc_node = nodes[0] - activation_type = 3 - if node.op_type == "Gelu": - activation_type = 3 - if node.op_type == "Relu": - activation_type = 4 - - for attr in fc_node.attribute: - if attr.name == "act_type": - attr.i = activation_type - break - - fc_node.output[0] = node.output[0] - self.nodes_to_add.append(fc_node) - self.nodes_to_remove.extend([node, fc_node]) - self.node_name_to_graph_name[fc_node.name] = self.this_graph_name - - -class FusionConformerCustomFCActivation(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, - "CustomFCPluginDynamic_IxRT", - ["Mul"], - "with activation", - ) - - 
def fuse(self, node, input_name_to_nodes, output_name_to_node): - - # return_indice = [] - nodes = self.model.match_parent_path( - node, - ["Sigmoid", "CustomFCPluginDynamic_IxRT"], - [ - None, - 0, - ], - # return_indice=return_indice, - ) - if nodes is None: - return - (sigmoid_node, custom_fc_node) = nodes - # if output_name_to_node[node.input[1 - return_indice[0]]] != custom_fc_node: - # return - activation_type = 20 - for attr in custom_fc_node.attribute: - if attr.name == "act_type": - attr.i = activation_type - break - custom_fc_node.attribute.extend([helper.make_attribute("swish_alpha", 1.0)]) - custom_fc_node.output[0] = node.output[0] - self.nodes_to_add.append(custom_fc_node) - self.nodes_to_remove.extend([node, sigmoid_node, custom_fc_node]) - self.node_name_to_graph_name[custom_fc_node.name] = self.this_graph_name - - -class FusionTorchvisionVitCustomFC(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomFCPluginDynamic_IxRT", ["CustomQKVToContextPluginDynamic_IxRT"], "torchvision vit custom_fc",) - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - custom_fc_node_0 = self.model.get_children(node, input_name_to_nodes) - transpose_node_0 = self.model.get_children(custom_fc_node_0[0], input_name_to_nodes) - - if transpose_node_0[0].op_type != "Transpose": - return - - custom_fc_node_0[0].output[0] = transpose_node_0[0].output[0] - - nodes = self.model.match_parent_path(node, ["CustomFCPluginDynamic_IxRT","Transpose"], [0, 0]) - if nodes is None: - return - - (custom_fc_node_1, transpose_node_1) = nodes - custom_fc_node_1.input[0] = transpose_node_1.input[0] - - self.nodes_to_add.append(custom_fc_node_1) - self.nodes_to_add.append(custom_fc_node_0[0]) - self.nodes_to_remove.extend([transpose_node_1, custom_fc_node_1, transpose_node_0[0], custom_fc_node_0[0]]) - self.node_name_to_graph_name[custom_fc_node_1.name] = self.this_graph_name - self.node_name_to_graph_name[custom_fc_node_0[0].name] = self.this_graph_name - \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py deleted file mode 100644 index 670a767e18e3ccd13d5540c9a415aa3ad8fc7525..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
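For readers tracking the `act_type` attribute set throughout the removed CustomFC passes, the integer codes observed in this file are summarized below; this is a descriptive recap of the deleted code, not an authoritative list of the plugin's supported values.

```python
# act_type values as assigned by the deleted CustomFC fusion passes
CUSTOM_FC_ACT_TYPE = {
    "none": -1,   # default when the CustomFC node is first created
    "gelu": 3,
    "relu": 4,
    "swish": 20,  # Sigmoid*Mul pattern; the conformer variant also adds swish_alpha=1.0
}
```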
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import List, Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionDisentangledAttention(Fusion): - """ - Match Disentangled Attention - ------------------------------------------- - | - GatherElements --> Add --> Add --> - | - GatherElements --> Transpose -> - """ - - def __init__(self, model: OnnxModel): - super().__init__(model, "DisentangledAttention_IxRT", "Add") - - def create_disentangled_attention_node( - self, - inputs: List[str], - outputs: List[str], - ) -> Union[NodeProto, None]: - """Create an disentangled attention node. - - Args: - inputs List[str]: data input names - outputs List[str]: data output names - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - disentangled_attention_node_name = self.model.create_node_name( - "DisentangledAttention" - ) - - disentangled_attention_node = helper.make_node( - "DisentangledAttention_IxRT", - inputs=inputs, - outputs=outputs, - name=disentangled_attention_node_name, - ) - disentangled_attention_node.domain = "com.iluvatar" - disentangled_attention_node.attribute.extend( - [helper.make_attribute("plugin_namespace", "")] - ) - disentangled_attention_node.attribute.extend( - [helper.make_attribute("plugin_version", "1")] - ) - disentangled_attention_node.attribute.extend( - [helper.make_attribute("factor", 0.1)] - ) - disentangled_attention_node.attribute.extend( - [helper.make_attribute("span", 512)] - ) - - return disentangled_attention_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - disentangled_attention_path1 = { - "path": (["Add", "GatherElements", "MatMul"], [None, None, None]), - } - - disentangled_attention_path2 = { - "path": ( - ["Add", "Transpose", "GatherElements", "MatMul"], - [None, None, None, None], - ), - } - - nodes1, _ = self.match_parent_path_from_dict(node, disentangled_attention_path1) - nodes2, _ = self.match_parent_path_from_dict(node, disentangled_attention_path2) - - if nodes1 is not None and nodes2 is not None: - if nodes1[0] == nodes2[0]: - (head_add, first_gather, first_matmul) = nodes1 - (_, transpose, second_gather, second_matmul) = nodes2 - tail_add = node - - first_input = [i for i in tail_add.input if i != head_add.output[0]][0] - second_input = first_matmul.output[0] - third_input = second_matmul.output[0] - output = tail_add.output[0] - - disentangled_attention_node = self.create_disentangled_attention_node( - [first_input, second_input, third_input], [output] - ) - self.nodes_to_add.append(disentangled_attention_node) - self.node_name_to_graph_name[ - disentangled_attention_node.name - ] = self.this_graph_name - self.nodes_to_remove.append(tail_add) - self.nodes_to_remove.append(head_add) - self.nodes_to_remove.append(first_gather) - self.nodes_to_remove.append(transpose) - self.nodes_to_remove.append(second_gather) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py deleted file mode 100644 index f46fa2c77da83612a25dd7bde215f20e70845ff7..0000000000000000000000000000000000000000 --- 
a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py +++ /dev/null @@ -1,1078 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Dict, List, Tuple, Union - -from onnx import NodeProto, TensorProto, helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionEmbedLayerNoMask(Fusion): - """ - Fuse embedding layer into one node (EmbedLayerNormalization). - It supports the following model types: BERT, DistilBert, ALBert. - """ - - def __init__(self, model: OnnxModel, description: str = "no mask"): - super().__init__( - model, - "EmbedLayerNormalization", - ["LayerNormalization", "SkipLayerNormalization"], - description, - ) - self.utils = FusionUtils(model) - self.shape_infer_helper = self.model.infer_runtime_shape({}, update=True) - # The following will be reset in each fuse call of FusionEmbedLayerNormalization - self.attention = None - self.embed_node = None - - def match_two_gather( - self, add: NodeProto - ) -> Union[None, Tuple[NodeProto, NodeProto]]: - gather_0_path = self.model.match_parent_path(add, ["Gather"], [0]) - if gather_0_path is None: - return None - - gather_1_path = self.model.match_parent_path(add, ["Gather"], [1]) - if gather_1_path is None: - return None - - return gather_0_path[0], gather_1_path[0] - - def check_attention_subgraph( - self, - layernorm: NodeProto, - input_name_to_nodes: Dict[str, List[NodeProto]], - is_distil_bert: bool, - ) -> bool: - """Check that LayerNormalization has a child of Attention node or subgraph like Attention. - - Args: - layernorm (NodeProto): LayerNormalization node - input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes - is_distil_bert (bool): whether it is DistilBert or not - - Returns: - bool: whether there is Attention node or subgraph like Attention - """ - self.attention = self.model.find_first_child_by_type( - layernorm, "Attention", input_name_to_nodes, recursive=False - ) - if self.attention is None: - # In case user disables attention fusion, check whether subgraph looks like Attention. - if layernorm.output[0] not in input_name_to_nodes: - return False - children = input_name_to_nodes[layernorm.output[0]] - - # For Albert, there is MatMul+Add after embedding layer before attention. 
- if ( - len(children) == 1 - and children[0].op_type == "MatMul" - and children[0].output[0] in input_name_to_nodes - ): - grandchildren = input_name_to_nodes[children[0].output[0]] - if ( - len(grandchildren) == 1 - and grandchildren[0].op_type == "Add" - and grandchildren[0].output[0] in input_name_to_nodes - ): - nodes = input_name_to_nodes[grandchildren[0].output[0]] - for node in nodes: - if node.op_type == "Attention": - self.attention = node - return True - children_types = sorted([child.op_type for child in nodes]) - else: - children_types = sorted([child.op_type for child in children]) - - # Two Shape nodes might be merged by ORT - if is_distil_bert: - # SkipLayerNormailization might exist when model has been optimized by ORT first. - if ( - children_types - != ["MatMul", "MatMul", "MatMul", "Shape", "SkipLayerNormalization"] - and children_types - != ["Add", "MatMul", "MatMul", "MatMul", "Shape", "Shape"] - and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape"] - ): - logger.debug( - "No Attention like subgraph in children of LayerNormalization" - ) - return False - else: - if children_types != [ - "Add", - "MatMul", - "MatMul", - "MatMul", - ] and children_types != [ - "MatMul", - "MatMul", - "MatMul", - "SkipLayerNormalization", - ]: - logger.debug( - "No Attention like subgraph in children of LayerNormalization" - ) - return False - return True - - def match_position_embedding_distilbert( - self, position_embedding_gather, input_ids, output_name_to_node - ): - """ Match position embedding path from input_ids to Gather for DistilBert. - - Pattern is like the following: - (input_ids) - | - Shape - | \ - | Gather (indices=1) - | | - | Cast (optional) - | | - | Range (start=0, end=*, delta=1) - | | - | Unsqueeze - | / - Expand - | - Gather - """ - # remove after tests pass - path1 = self.model.match_parent_path( - position_embedding_gather, ["Expand", "Shape"], [1, 1] - ) - if path1 is None: - path1 = self.model.match_parent_path( - position_embedding_gather, - ["Expand", "Where", "Reshape", "Shape"], - [1, 1, 2, 0], - ) - if path1 is None: - return False - - expand, shape = path1[0], path1[-1] - if shape.input[0] != input_ids: - return False - - _, path2, _ = self.model.match_parent_paths( - expand, - [ - (["Unsqueeze", "Range", "Cast", "Gather", "Shape"], [0, 0, 1, 0, 0]), - (["Unsqueeze", "Range", "Gather", "Shape"], [0, 0, 1, 0]), - ], - output_name_to_node, - ) - if path2 is None: - return False - - range_node = path2[1] - if not ( - self.utils.check_node_input_value(range_node, 0, 0) - and self.utils.check_node_input_value(range_node, 2, 1) - ): - return False - - gather_node = path2[-2] - if not (self.utils.check_node_input_value(gather_node, 1, 1)): - return False - - shape_node = path2[-1] - if shape_node.input[0] != input_ids: - return False - - return True - - def match_position_embedding_roberta( - self, position_embedding_gather, input_ids, output_name_to_node - ): - """Match position embedding path from input_ids to Gather for Roberta. - - Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id): - (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather - | ^ - V | - +------------------------------+ - - Roberta new pattern from transformers v4.9: - (input_ids) --> Equal(B=?) 
-- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather - | ^ - V | - +-------------------------------------------+ - - start_node = position_embedding_gather - start_index = 1 - - # match optional Cast node. - parent = self.model.get_parent(start_node, start_index, output_name_to_node) - if parent is None: - return - if parent.op_type == "Cast": - if OnnxModel.get_node_attribute(parent, "to") != 7: - return - start_node = parent - start_index = 0 - - i, path, return_indices = self.model.match_parent_paths( - start_node, - [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]), - (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])], - output_name_to_node) - - if path is not None: - # constant input of Add shall be 1. - i, value = self.model.get_constant_input(path[0]) - if value != 1: - return False - - _, self.padding_word_id = self.model.get_constant_input(path[-1]) - - return input_ids == path[-1].input[0] - """ - - return False - - def match_position_embedding_bert( - self, position_embedding_gather, input_ids, output_name_to_node - ): - """ Match position embedding path from input_ids to Gather for BERT. - - BERT Embedding Layer Pattern: - (input_ids) - / \ - / Shape - / | - / Gather (indices=1) - / | - / Add (optional, B=0) - / | - Gather (segment_ids) Unsqueeze (axes=0) - \ | | - \ Gather Slice (data[1,512], starts=0, ends=*, axes=1, steps=1) - \ / | - Add Gather - \ / - Add - | - LayerNormalization - """ - path = self.model.match_parent_path( - position_embedding_gather, - ["Slice", "Unsqueeze"], - [1, 2], - output_name_to_node, - ) - if path is None: - return False - - slice, unsqueeze = path - slice_weight = self.model.get_constant_value(slice.input[0]) - if not ( - slice_weight is not None - and len(slice_weight.shape) == 2 - and slice_weight.shape[0] == 1 - and self.utils.check_node_input_value(slice, 1, [0]) - and self.utils.check_node_input_value(slice, 3, [1]) - and ( - len(slice.input) == 4 - or self.utils.check_node_input_value(slice, 4, [1]) - ) - ): - return False - - opset_version = self.model.get_opset_version() - if opset_version < 13: - if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]): - return False - else: - if not self.utils.check_node_input_value(unsqueeze, 1, [0]): - return False - - node = self.model.get_parent(unsqueeze, 0, output_name_to_node) - if node is None: - return False - if node.op_type == "Add": - if not self.utils.check_node_input_value(node, 1, 0): - return False - gather = self.model.get_parent(node, 0, output_name_to_node) - else: - gather = node - - if gather is None or gather.op_type != "Gather": - return False - if not (self.utils.check_node_input_value(gather, 1, 1)): - return False - - shape = self.model.get_parent(gather, 0, output_name_to_node) - if shape is None or shape.op_type != "Shape": - return False - - return input_ids == shape.input[0] - - def match_position_embedding( - self, position_embedding_gather, input_ids, output_name_to_node - ): - if self.match_position_embedding_bert( - position_embedding_gather, input_ids, output_name_to_node - ): - return True - - # TODO: Support roberta (position starts from 2 instead of 0) in EmbedLayerNormalization kernel - # related: https://github.com/huggingface/transformers/issues/10736 - # if self.match_position_embedding_roberta(position_embedding_gather, input_ids, output_name_to_node): - # return True - - if self.match_position_embedding_distilbert( 
- position_embedding_gather, input_ids, output_name_to_node - ): - return True - - return False - - def check_embedding( - self, word_embedding_gather, segment_embedding_gather, position_embedding_gather - ): - """Sanity check of embedding weights, and match hidden_size of weights and shape of inputs.""" - input_ids = word_embedding_gather.input[1] - segment_ids = ( - segment_embedding_gather.input[1] if segment_embedding_gather else None - ) - position_ids = position_embedding_gather.input[1] - - if self.shape_infer_helper is not None: - input_ids_shape = self.shape_infer_helper.get_edge_shape(input_ids) - position_ids_shape = self.shape_infer_helper.get_edge_shape(position_ids) - assert input_ids_shape and position_ids_shape - if not ( - len(input_ids_shape) == 2 - and len(position_ids_shape) == 2 - and input_ids_shape[1] == position_ids_shape[1] - ): - logger.info( - "Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}".format( - input_ids_shape, position_ids_shape - ) - ) - return False - - if segment_ids and not self.shape_infer_helper.compare_shape( - input_ids, segment_ids - ): - logger.info( - "Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".format( - input_ids_shape, - self.shape_infer_helper.get_edge_shape(segment_ids), - ) - ) - return False - - word_embedding_table = self.model.get_constant_value( - word_embedding_gather.input[0] - ) - if word_embedding_table is None or len(word_embedding_table.shape) != 2: - logger.info( - "Cannot fuse EmbedLayerNormalization: word embedding table is not expected" - ) - return False - - position_embedding_table = self.model.get_constant_value( - position_embedding_gather.input[0] - ) - if ( - position_embedding_table is None - or len(position_embedding_table.shape) != 2 - or (word_embedding_table.shape[1] != position_embedding_table.shape[1]) - ): - logger.info( - "Cannot fuse EmbedLayerNormalization: position embedding table is not expected" - ) - return False - - if segment_ids: - segment_embedding_table = self.model.get_constant_value( - segment_embedding_gather.input[0] - ) - if ( - segment_embedding_table is None - or len(segment_embedding_table.shape) != 2 - or (word_embedding_table.shape[1] != segment_embedding_table.shape[1]) - ): - logger.info( - "Cannot fuse EmbedLayerNormalization: segment embedding table is not expected" - ) - return False - - # In normal case, word embeding table is the largest, and segment embedding table is the smallest, while postion embedding table is in between. - # TODO: use other information (like initializer names) to identify different embedding weights automatically. 
- if word_embedding_table.shape[0] <= position_embedding_table.shape[0]: - logger.warning( - f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]}" - ) - - if segment_ids: - if word_embedding_table.shape[0] <= segment_embedding_table.shape[0]: - logger.warning( - f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}" - ) - - if position_embedding_table.shape[0] <= segment_embedding_table.shape[0]: - logger.warning( - f"position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}" - ) - - return True - - def cast_to_int32(self, input_name: str) -> Tuple[str, Union[None, NodeProto]]: - """Cast a graph input or node input to int32. - - Args: - input_name (str): name of graph input or node input - - Returns: - A tuple of casted input name and the cast node. - int32_output (str): If input is int32, it is the input name, Otherwise it is output name of Cast node. - input_cast_node (Union[None, NodeProto]): Cast node. It could be None if input is int32. - """ - input_cast_node = None - graph_input = self.model.find_graph_input(input_name) - if graph_input is not None: - if graph_input.type.tensor_type.elem_type != TensorProto.INT32: - int32_output, input_cast_node = self.utils.cast_input_to_int32( - input_name - ) - else: - int32_output = input_name - else: - int32_output, input_cast_node = self.utils.cast_input_to_int32(input_name) - - return int32_output, input_cast_node - - def create_fused_node( - self, - input_ids: str, - layernorm: NodeProto, - word_embedding_gather: NodeProto, - position_embedding_gather: NodeProto, - segment_embedding_gather: Union[None, NodeProto], - position_ids: str = None, - embedding_sum_output=False, - ): - """Create an EmbedLayerNormalization node. Note that segment embedding is optional. - - Args: - input_ids (str): input_ids for word embeddings - layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node. - word_embedding_gather (NodeProto): the Gather node for word embedding - position_embedding_gather (NodeProto): the Gather node for position embedding - segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None. - - Returns: - NodeProto: the EmbedLayerNormalization node created. 
- """ - nodes_to_add = [] - input_ids, _ = self.cast_to_int32(input_ids) - - node_name = self.model.create_node_name("EmbedLayerNormalization") - - if layernorm.op_type == "LayerNormalization": - gamma = layernorm.input[1] - beta = layernorm.input[2] - else: # SkipLayerNormalization - gamma = layernorm.input[2] - beta = layernorm.input[3] - - embed_node_inputs = None - if segment_embedding_gather is not None: - segment_ids, _ = self.cast_to_int32(segment_embedding_gather.input[1]) - - embed_node_inputs = [ - input_ids, - segment_ids, - word_embedding_gather.input[0], - position_embedding_gather.input[0], - segment_embedding_gather.input[0], - gamma, - beta, - ] - else: # no segment embedding - embed_node_inputs = [ - input_ids, - "", - word_embedding_gather.input[0], - position_embedding_gather.input[0], - "", - gamma, - beta, - ] - - if position_ids is not None: - # Adding an empty input for mask before position_ids - embed_node_inputs.append("") - position_ids, _ = self.cast_to_int32(position_ids) - embed_node_inputs.append(position_ids) - - embed_node_outputs = [node_name + "_output", node_name + "_dummy_mask_index"] - if embedding_sum_output: - embed_node_outputs.append(node_name + "_embedding_sum") - - embed_node = helper.make_node( - "EmbedLayerNormalization", - embed_node_inputs, - outputs=embed_node_outputs, - name=node_name, - ) - - embed_node.domain = "com.microsoft" - - # Pass attribute "epsilon" from normalize node to EmbedLayerNormalization. - for att in layernorm.attribute: - if att.name == "epsilon": - embed_node.attribute.extend([att]) - - # Set default value to 1e-12 if no attribute is found. - # OnnxRuntime 1.2.0 or older has no epsilon attribute. The optimized model can only work for 1.3.0 or later. - if len(embed_node.attribute) == 0: - embed_node.attribute.extend([helper.make_attribute("epsilon", 1.0e-12)]) - - # Make sure new EmbedLayerNormalization node is the last one in self.nodes_to_add. 
- nodes_to_add.append(embed_node) - for node in nodes_to_add: - self.node_name_to_graph_name[node.name] = self.this_graph_name - self.nodes_to_add.extend(nodes_to_add) - - self.embed_node = embed_node - return embed_node - - def finish_fusion(self, layernorm, embed_node): - self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0]) - # use prune graph to remove nodes that is not needed - self.prune_graph = True - - def is_embedding_sum_needed(self, add_before_layer_norm): - """Check that Add before layer norm has an output to add before next layernorm - - Args: - add_before_layer_norm (NodeProto): Add before any LayerNormalization node in topological order of graph - - Returns: - bool: whether there is an extra output needed out of embed layer norm node - """ - - nodes = self.model.get_children(add_before_layer_norm) - - return len(nodes) > 1 - - def fuse_gpt2( - self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - # graph checks - # gpt2 has no segment embedding, subgraph pattern is like - # input_ids position_ids - # | | - # Gather Gather - # \ / - # Add _ _ _ _ _ - # | | - # LayerNormalization | - # | | - # Attention | - # | | - # Matmul | - # | / - # Add / - # \ / - # Add - two_gather = self.match_two_gather(add_before_layernorm) - if two_gather is None: - return False - - add_output = add_before_layernorm.output[0] - - word_embedding_gather, position_embedding_gather = two_gather - input_ids = word_embedding_gather.input[1] - position_ids = position_embedding_gather.input[1] - - if not self.check_attention_subgraph( - layernorm, input_name_to_nodes, is_distil_bert=False - ): - return False - - if not self.check_embedding( - word_embedding_gather, None, position_embedding_gather - ): - return False - - optional_embedding_sum_output = False - if self.is_embedding_sum_needed(add_before_layernorm): - optional_embedding_sum_output = True - - # make the fused node - embed_node = self.create_fused_node( - input_ids, - layernorm, - word_embedding_gather, - position_embedding_gather, - None, - position_ids, - optional_embedding_sum_output, - ) - - # direct the output to another add too - self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0]) - if optional_embedding_sum_output: - self.model.replace_input_of_all_nodes(add_output, embed_node.output[2]) - - return True - - def fuse_distilbert( - self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - """Fuse embedding layer for DistilBert - Args: - layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization - add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself - input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes - output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes - """ - - # DistilBert has no segment embedding, subgraph pattern is like - # input_ids - # | \ - # | (position_embedding_subgraph) - # | | - # Gather Gather - # \ / - # Add - # | - # LayerNormalization - two_gather = self.match_two_gather(add_before_layernorm) - if two_gather is None: - return False - - word_embedding_gather, position_embedding_gather = two_gather - input_ids = word_embedding_gather.input[1] - - if not self.check_attention_subgraph( - layernorm, input_name_to_nodes, is_distil_bert=True - ): - return False - - if not self.match_position_embedding( - position_embedding_gather, input_ids, output_name_to_node - ): - return False - - 
if not self.check_embedding( - word_embedding_gather, None, position_embedding_gather - ): - return False - - embed_node = self.create_fused_node( - input_ids, layernorm, word_embedding_gather, position_embedding_gather, None - ) - self.finish_fusion(layernorm, embed_node) - return True - - def fuse_bert( - self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - """Fuse embedding layer for Bert - Args: - layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization - add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself - input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes - output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes - """ - - add_2_gather = self.model.match_parent_path(add_before_layernorm, ["Add"], [0]) - if add_2_gather is None: - return False - - two_gather = self.match_two_gather(add_2_gather[0]) - if two_gather is None: - return False - - word_embedding_gather, segment_embedding_gather = two_gather - - input_ids = word_embedding_gather.input[1] - - if not self.check_attention_subgraph( - layernorm, input_name_to_nodes, is_distil_bert=False - ): - return False - - position_embedding_path = self.model.match_parent_path( - add_before_layernorm, ["Gather"], [1] - ) - if position_embedding_path is None: - return False - - position_embedding_gather = position_embedding_path[0] - if not self.match_position_embedding( - position_embedding_gather, input_ids, output_name_to_node - ): - if not self.match_position_embedding( - segment_embedding_gather, input_ids, output_name_to_node - ): - return False - # position and segment are switched - temp = segment_embedding_gather - segment_embedding_gather = position_embedding_gather - position_embedding_gather = temp - - if not self.check_embedding( - word_embedding_gather, segment_embedding_gather, position_embedding_gather - ): - return False - - embed_node = self.create_fused_node( - input_ids, - layernorm, - word_embedding_gather, - position_embedding_gather, - segment_embedding_gather, - ) - self.finish_fusion(layernorm, embed_node) - return True - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if node.op_type == "LayerNormalization": - first_add_path = self.model.match_parent_path(node, ["Add"], [0]) - if first_add_path is None: - return - add_before_layernorm = first_add_path[0] - else: # SkipLayerNormalization - add_before_layernorm = node # Add is fused into SkipLayerNormalization - - if self.fuse_gpt2( - node, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - return - - if self.fuse_distilbert( - node, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - return - - if self.fuse_bert( - node, add_before_layernorm, input_name_to_nodes, output_name_to_node - ): - return - - -class FusionEmbedLayerNormalization(FusionEmbedLayerNoMask): - def __init__(self, model: OnnxModel): - super().__init__(model, "with mask") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - # Reset attention and embed_node so that we know fusion is successful when they are not None. 
- self.attention = None - self.embed_node = None - super().fuse(node, input_name_to_nodes, output_name_to_node) - - if self.attention and self.embed_node: - mask_index = self.attention.input[3] - if mask_index in output_name_to_node: - node = output_name_to_node[mask_index] - if node.op_type == "ReduceSum": - embed_node = self.embed_node - mask_input_name = node.input[0] - self.nodes_to_remove.extend([node]) - embed_node.input.append(mask_input_name) - embed_node.output[1] = mask_index - - -class FusionBertEmbedLayerNormalization(Fusion): - """ - Fuse BertEmbedLayerNormalization subgraph into one node. - """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomEmbLayerNormPluginDynamic_IxRT", "CustomQKVToContextPluginDynamic_IxRT" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - input --> CustomEmbLayerNormPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT --> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT - """ - children = self.model.get_children(node, input_name_to_nodes) - parent = self.model.get_parents(node, output_name_to_node) - - if len(children) == 0: - return - if len(parent) == 0: - return - - start_node = node - - # word_embeddings - word_embeddings_node = self.model.match_parent_path( - start_node, - ["CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"], - [0, 0, 0, 0, 0], - output_name_to_node, - ) - - # token_type_embeddings - token_type_embeddings_node = self.model.match_parent_path( - start_node, - ["CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"], - [0, 0, 0, 0, 1], - output_name_to_node, - ) - - # attention_mask - attention_mask_node = self.model.match_parent_path( - start_node, - ["Mul", "Sub", "Cast", "Unsqueeze"], - [1, 0, 1, 0], - output_name_to_node, - ) - - if word_embeddings_node is None or token_type_embeddings_node is None or attention_mask_node is None: - return - - if word_embeddings_node and token_type_embeddings_node and attention_mask_node: - subgraph_nodes = [] - subgraph_nodes.extend(word_embeddings_node) - subgraph_nodes.extend(token_type_embeddings_node) - subgraph_nodes.extend(attention_mask_node) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - subgraph_nodes_remove = [] - for item in subgraph_nodes_unique: - if item.op_type != "CustomFCPluginDynamic_IxRT": - subgraph_nodes_remove.append(item) - - # input_ids = self.model.get_graph_inputs_excluding_initializers()[0] - # token_type_ids = self.model.get_graph_inputs_excluding_initializers()[1] - # attention_mask = self.model.get_graph_inputs_excluding_initializers()[2] - - emblayernorm_out = word_embeddings_node[1].output[0] - emblayernorm_out_mask = attention_mask_node[0].output[0] - - # self.model.modify_node_output_type(emblayernorm_out_mask, 5) - - beta_data = self.model.get_initializer(word_embeddings_node[1].input[2], True) - embeddings_layernorm_beta_name = "bert_embeddings_layernorm_beta" - embeddings_layernorm_beta = helper.make_tensor( - embeddings_layernorm_beta_name, TensorProto.FLOAT, beta_data.shape, beta_data.flatten().tolist()) - - gamma_data = self.model.get_initializer(word_embeddings_node[1].input[1], True) - embeddings_layernorm_gamma_name = "bert_embeddings_layernorm_gamma" - embeddings_layernorm_gamma = helper.make_tensor( - embeddings_layernorm_gamma_name, TensorProto.FLOAT, gamma_data.shape, gamma_data.flatten().tolist()) - - 
embeddings_word_embeddings_data = self.model.get_initializer(word_embeddings_node[4].input[0], True) - embeddings_word_embeddings_name = "bert_embeddings_word_embeddings" - embeddings_word_embeddings = helper.make_tensor( - embeddings_word_embeddings_name, TensorProto.FLOAT, embeddings_word_embeddings_data.shape, - embeddings_word_embeddings_data.flatten().tolist()) - - embeddings_token_type_embeddings_data = self.model.get_initializer(token_type_embeddings_node[4].input[0], True) - embeddings_token_type_embeddings_name = "bert_embeddings_token_type_embeddings" - embeddings_token_type_embeddings = helper.make_tensor( - embeddings_token_type_embeddings_name, TensorProto.FLOAT, embeddings_token_type_embeddings_data.shape, - embeddings_token_type_embeddings_data.flatten().tolist()) - - embeddings_position_embeddings_data = self.model.get_initializer(token_type_embeddings_node[2].input[1], True) - embeddings_position_embeddings_name = "bert_embeddings_token_type_embeddings" - embeddings_position_embeddings = helper.make_tensor( - embeddings_position_embeddings_name, TensorProto.FLOAT, embeddings_position_embeddings_data.shape, - embeddings_position_embeddings_data.flatten().tolist()) - - self.model.add_initializer(embeddings_layernorm_beta, self.this_graph_name) - self.model.add_initializer(embeddings_layernorm_gamma, self.this_graph_name) - self.model.add_initializer(embeddings_word_embeddings, self.this_graph_name) - self.model.add_initializer(embeddings_token_type_embeddings, self.this_graph_name) - self.model.add_initializer(embeddings_position_embeddings, self.this_graph_name) - - - emblayernorm_node = helper.make_node( - "CustomEmbLayerNormPluginDynamic_IxRT", - inputs=[word_embeddings_node[4].input[1], token_type_embeddings_node[4].input[1], attention_mask_node[3].input[0]], - outputs=[emblayernorm_out, emblayernorm_out_mask], - name=self.model.create_node_name( - "BertEmbedLayerNormalization", name_prefix="BertEmbedLayerNormalization" - ), - ) - emblayernorm_node.domain = "com.iluvatar" - emblayernorm_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - emblayernorm_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - emblayernorm_node.attribute.extend([helper.make_attribute("output_fp16", 1)]) - emblayernorm_node.attribute.extend([helper.make_attribute("full_mask", 1)]) - emblayernorm_node.attribute.extend([helper.make_attribute("mha_type_id", 2)]) - emblayernorm_node.attribute.extend([helper.make_attribute("pad_id", 0)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_beta", embeddings_layernorm_beta)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_gamma", embeddings_layernorm_gamma)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_word_embeddings", embeddings_word_embeddings)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_token_type_embeddings", embeddings_token_type_embeddings)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_position_embeddings", embeddings_position_embeddings)]) - - self.nodes_to_remove.extend(subgraph_nodes_remove) - - self.nodes_to_add.append(emblayernorm_node) - self.node_name_to_graph_name[emblayernorm_node.name] = self.this_graph_name - - -class FusionAlbertEmbedLayerNormalization(Fusion): - """ - Fuse AlbertEmbedLayerNormalization subgraph into one node. 
- """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomEmbLayerNormPluginDynamic_IxRT", "CustomQKVToContextPluginDynamic_IxRT" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - input --> CustomEmbLayerNormPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT --> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT - """ - children = self.model.get_children(node, input_name_to_nodes) - parent = self.model.get_parents(node, output_name_to_node) - - if len(children) == 0: - return - if len(parent) == 0: - return - - start_node = node - - # word_embeddings - word_embeddings_node = self.model.match_parent_path( - start_node, - ["CustomFCPluginDynamic_IxRT","CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"], - [0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - # token_type_embeddings - token_type_embeddings_node = self.model.match_parent_path( - start_node, - ["CustomFCPluginDynamic_IxRT","CustomFCPluginDynamic_IxRT", "LayerNormalization", "Add", "Add", "Gather"], - [0, 0, 0, 0, 0, 1], - output_name_to_node, - ) - - # attention_mask - attention_mask_node = self.model.match_parent_path( - start_node, - ["Mul", "Sub", "Cast", "Unsqueeze"], - [1, 0, 1, 0], - output_name_to_node, - ) - - if word_embeddings_node is None or token_type_embeddings_node is None or attention_mask_node is None: - return - - if word_embeddings_node and token_type_embeddings_node and attention_mask_node: - subgraph_nodes = [] - subgraph_nodes.extend(word_embeddings_node) - subgraph_nodes.extend(token_type_embeddings_node) - subgraph_nodes.extend(attention_mask_node) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - subgraph_nodes_remove = [] - for item in subgraph_nodes_unique: - if item.op_type != "CustomFCPluginDynamic_IxRT": - subgraph_nodes_remove.append(item) - - # input_ids = self.model.get_graph_inputs_excluding_initializers()[0] - # token_type_ids = self.model.get_graph_inputs_excluding_initializers()[1] - # attention_mask = self.model.get_graph_inputs_excluding_initializers()[2] - - emblayernorm_out = word_embeddings_node[2].output[0] - emblayernorm_out_mask = attention_mask_node[0].output[0] - - beta_data = self.model.get_initializer(word_embeddings_node[2].input[2], True) - embeddings_layernorm_beta_name = "bert_embeddings_layernorm_beta" - embeddings_layernorm_beta = helper.make_tensor( - embeddings_layernorm_beta_name, TensorProto.FLOAT, beta_data.shape, beta_data.flatten().tolist()) - - gamma_data = self.model.get_initializer(word_embeddings_node[2].input[1], True) - embeddings_layernorm_gamma_name = "bert_embeddings_layernorm_gamma" - embeddings_layernorm_gamma = helper.make_tensor( - embeddings_layernorm_gamma_name, TensorProto.FLOAT, gamma_data.shape, gamma_data.flatten().tolist()) - - embeddings_word_embeddings_data = self.model.get_initializer(word_embeddings_node[5].input[0], True) - embeddings_word_embeddings_name = "bert_embeddings_word_embeddings" - embeddings_word_embeddings = helper.make_tensor( - embeddings_word_embeddings_name, TensorProto.FLOAT, embeddings_word_embeddings_data.shape, - embeddings_word_embeddings_data.flatten().tolist()) - - embeddings_token_type_embeddings_data = self.model.get_initializer(token_type_embeddings_node[5].input[0], True) - embeddings_token_type_embeddings_name = "bert_embeddings_token_type_embeddings" - embeddings_token_type_embeddings = 
helper.make_tensor( - embeddings_token_type_embeddings_name, TensorProto.FLOAT, embeddings_token_type_embeddings_data.shape, - embeddings_token_type_embeddings_data.flatten().tolist()) - - embeddings_position_embeddings_data = self.model.get_initializer(token_type_embeddings_node[3].input[1], True) - embeddings_position_embeddings_name = "bert_embeddings_token_type_embeddings" - embeddings_position_embeddings = helper.make_tensor( - embeddings_position_embeddings_name, TensorProto.FLOAT, embeddings_position_embeddings_data.shape, - embeddings_position_embeddings_data.flatten().tolist()) - - self.model.add_initializer(embeddings_layernorm_beta, self.this_graph_name) - self.model.add_initializer(embeddings_layernorm_gamma, self.this_graph_name) - self.model.add_initializer(embeddings_word_embeddings, self.this_graph_name) - self.model.add_initializer(embeddings_token_type_embeddings, self.this_graph_name) - self.model.add_initializer(embeddings_position_embeddings, self.this_graph_name) - - emblayernorm_node = helper.make_node( - "CustomEmbLayerNormPluginDynamic_IxRT", - inputs=[word_embeddings_node[5].input[1], token_type_embeddings_node[5].input[1], attention_mask_node[3].input[0]], - outputs=[emblayernorm_out, emblayernorm_out_mask], - name=self.model.create_node_name( - "BertEmbedLayerNormalization", name_prefix="BertEmbedLayerNormalization" - ), - ) - emblayernorm_node.domain = "com.iluvatar" - emblayernorm_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - emblayernorm_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - emblayernorm_node.attribute.extend([helper.make_attribute("output_fp16", 1)]) - emblayernorm_node.attribute.extend([helper.make_attribute("full_mask", 1)]) - emblayernorm_node.attribute.extend([helper.make_attribute("mha_type_id", 2)]) - emblayernorm_node.attribute.extend([helper.make_attribute("pad_id", 0)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_beta", embeddings_layernorm_beta)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_layernorm_gamma", embeddings_layernorm_gamma)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_word_embeddings", embeddings_word_embeddings)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_token_type_embeddings", embeddings_token_type_embeddings)]) - emblayernorm_node.attribute.extend([helper.make_attribute("bert_embeddings_position_embeddings", embeddings_position_embeddings)]) - - self.nodes_to_remove.extend(subgraph_nodes_remove) - - self.nodes_to_add.append(emblayernorm_node) - self.node_name_to_graph_name[emblayernorm_node.name] = self.this_graph_name \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py deleted file mode 100644 index 067ff26e4eb51ea0df3ad6b49318179afd3b4177..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py +++ /dev/null @@ -1,420 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Dict, Optional - -from onnx import helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionFastGelu(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomGeluPluginDynamic_IxRT", "Tanh") - - def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict): - if self.fuse_1(tanh_node, input_name_to_nodes, output_name_to_node): - return - - if self.fuse_2(tanh_node, input_name_to_nodes, output_name_to_node): - return - - if self.fuse_3(tanh_node, input_name_to_nodes, output_name_to_node): - return - - def fuse_1( - self, tanh_node, input_name_to_nodes, output_name_to_node - ) -> Optional[bool]: - """ - Fuse Gelu with tanh into one node: - +---------------------------+ - | | - | v - [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul - | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^ - | | - +------> Mul(B=0.5)--------------------------------------------+ - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if tanh_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[tanh_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_tanh = children[0] - - if not self.model.has_constant_input(add_after_tanh, 1.0): - return - - if add_after_tanh.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_tanh.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_tanh = children[0] - - mul_half = self.model.match_parent( - mul_after_tanh, "Mul", None, output_name_to_node - ) - if mul_half is None: - return - - i = self.model.find_constant_input(mul_half, 0.5) - if i < 0: - return - - root_input = mul_half.input[0 if i == 1 else 1] - - # root_node could be None when root_input is graph input - root_node = self.model.get_parent( - mul_half, 0 if i == 1 else 1, output_name_to_node - ) - - mul_before_tanh = self.model.match_parent( - tanh_node, "Mul", 0, output_name_to_node - ) - if mul_before_tanh is None: - return - - i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) - if i < 0: - return - - add_before_tanh = self.model.match_parent( - mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node - ) - if add_before_tanh is None: - return - - mul_after_pow = self.model.match_parent( - add_before_tanh, - "Mul", - None, - output_name_to_node, - exclude=[root_node] if root_node else [], - ) - if mul_after_pow is None: - return - - i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) - if i < 0: - return - - pow = self.model.match_parent( - mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node - ) - if pow is None: - return - - if not self.model.has_constant_input(pow, 3.0): - return - - if pow.input[0] != root_input: - return - - subgraph_nodes = [ - mul_after_tanh, - mul_half, - add_after_tanh, - tanh_node, - mul_before_tanh, - add_before_tanh, - mul_after_pow, - pow, - ] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [mul_after_tanh.output[0]], - input_name_to_nodes, - output_name_to_node, - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "CustomGeluPluginDynamic_IxRT", - inputs=[root_input], - outputs=mul_after_tanh.output, - name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_2( - self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from Tensorflow model. - Fuse Gelu with tanh into one node: - +---------------------------+ - | | - | v - [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul(B=0.5)-->Mul--> - | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^ - | | - +---------------------------------------------------------------------------+ - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if tanh_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[tanh_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_tanh = children[0] - - if not self.model.has_constant_input(add_after_tanh, 1.0): - return - - if add_after_tanh.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_tanh.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_half = children[0] - - i = self.model.find_constant_input(mul_half, 0.5) - if i < 0: - return - - if mul_half.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[mul_half.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_mul_half = children[0] - - root_node = self.model.get_parent( - mul_after_mul_half, - 0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1, - output_name_to_node, - ) - if root_node is None: - return - - mul_before_tanh = self.model.match_parent( - tanh_node, "Mul", 0, output_name_to_node - ) - if mul_before_tanh is None: - return - - i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) - if i < 0: - return - - add_before_tanh = self.model.match_parent( - mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node - ) - if add_before_tanh is None: - return - - mul_after_pow = self.model.match_parent( - add_before_tanh, "Mul", None, output_name_to_node, exclude=[root_node] - ) - if mul_after_pow is None: - return - - i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) - if i < 0: - return - - pow = self.model.match_parent( - mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node - ) - if pow is None: - return - - if not self.model.has_constant_input(pow, 3.0): - return - - if pow.input[0] != root_node.output[0]: - return - - subgraph_nodes = [ - mul_after_mul_half, - mul_half, - add_after_tanh, - tanh_node, - mul_before_tanh, - add_before_tanh, - mul_after_pow, - pow, - ] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [mul_after_mul_half.output[0]], - input_name_to_nodes, - output_name_to_node, - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "CustomGeluPluginDynamic_IxRT", - inputs=[root_node.output[0]], - outputs=mul_after_mul_half.output, - name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_3( - self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - OpenAI's gelu implementation, also used in Megatron: - Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x))) - - Fuse subgraph into a FastGelu node: - +------------ Mul (B=0.79788456) -------------------+ - | | - +-------------------------------+ | - | | | - | v v - [root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul--> - | ^ - | | - +-----------> Mul (B=0.5) --------------------------------------------------------+ - """ - if tanh_node.output[0] not in input_name_to_nodes: - return - - children = input_name_to_nodes[tanh_node.output[0]] 
- if len(children) != 1 or children[0].op_type != "Add": - return - add_after_tanh = children[0] - - if not self.model.has_constant_input(add_after_tanh, 1.0): - return - - if add_after_tanh.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_tanh.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_last = children[0] - - mul_half = self.model.match_parent(mul_last, "Mul", None, output_name_to_node) - if mul_half is None: - return - - i = self.model.find_constant_input(mul_half, 0.5) - if i < 0: - return - - root_input = mul_half.input[0 if i == 1 else 1] - - mul_before_tanh = self.model.match_parent( - tanh_node, "Mul", 0, output_name_to_node - ) - if mul_before_tanh is None: - return - - add_1 = self.model.match_parent( - mul_before_tanh, "Add", None, output_name_to_node - ) - if add_1 is None: - return - j = self.model.find_constant_input(add_1, 1.0) - if j < 0: - return - - mul_7978 = self.model.match_parent( - mul_before_tanh, "Mul", None, output_name_to_node - ) - if mul_7978 is None: - return - k = self.model.find_constant_input(mul_7978, 0.7978, delta=0.0001) - if k < 0: - return - if mul_7978.input[0 if k == 1 else 1] != root_input: - return - - mul_before_add_1 = self.model.match_parent( - add_1, "Mul", 0 if j == 1 else 1, output_name_to_node - ) - if mul_before_add_1 is None: - return - - if mul_before_add_1.input[0] == root_input: - another = 1 - elif mul_before_add_1.input[1] == root_input: - another = 0 - else: - return - - mul_0447 = self.model.match_parent( - mul_before_add_1, "Mul", another, output_name_to_node - ) - if mul_0447 is None: - return - m = self.model.find_constant_input(mul_0447, 0.0447, delta=0.0001) - if m < 0: - return - - if mul_0447.input[0 if m == 1 else 1] != root_input: - return - - subgraph_nodes = [ - mul_0447, - mul_before_add_1, - add_1, - mul_before_tanh, - tanh_node, - add_after_tanh, - mul_7978, - mul_half, - mul_last, - ] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [mul_last.output[0]], - input_name_to_nodes, - output_name_to_node, - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "CustomGeluPluginDynamic_IxRT", - inputs=[root_input], - outputs=mul_last.output, - name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py deleted file mode 100644 index 1f60ab7628f1d700042cf1e025df5bb22fc1d641..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionRemoveUselessElementwise(Fusion): - """ - Fusion to remove useless elementwise in roformer model. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__(model, "Sqrt", "Sqrt") - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - paths = { - "path1": ( - ["Max", "Min", "Add", "GlobalAveragePool"], - [None, None, None, None], - ), - } - - pool_nodes, pool_path = self.match_parent_path_from_dict(node, paths) - - if pool_nodes is None: - logger.debug("GlobalAveragePool: failed searching path after pool node.") - return - - max_node = pool_nodes[0] - min_node = pool_nodes[1] - add_node = pool_nodes[2] - pool_node = pool_nodes[3] - if not self.model.has_constant_input(add_node, 9.999999960041972e-13): - return - - if not self.model.has_constant_input(max_node, 0): - return - - max_node.input[0] = pool_node.output[0] - self.nodes_to_remove.extend([min_node, add_node]) - - -class FusionFormatInvalidMask(Fusion): - """ - Fusion to format invalid mask in roformer model. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__(model, "", ["Greater"]) - - def fuse(self, start_node, input_name_to_nodes, output_name_to_node): - nodes = self.model.match_parent_path( - start_node, - [ - "ReduceMin", - "Cast", - "Concat", - "Unsqueeze", - "Greater", - "ReduceMin", - "Cast", - "Concat", - "Unsqueeze", - ], - [0, 0, 0, 0, 0, 0, 0, 0, 0], - ) - - if nodes is None: - logger.debug("Roformer: unable to format the mask.") - return - - unsqueeze_node = nodes[-1] - - for node in self.model.graph().node: - for (id, input) in enumerate(node.input): - if start_node.output[0] == input: - node.input[id] = unsqueeze_node.input[0] - - self.nodes_to_remove.extend(nodes) - self.nodes_to_remove.extend([start_node]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py deleted file mode 100644 index 714212664e452ad7a42daa3623185d973e4bb773..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py +++ /dev/null @@ -1,383 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
-# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Dict, Optional - -from onnx import helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionGelu(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "Gelu", "Erf") - - def fuse(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict): - if self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node): - return - if self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node): - return - if self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node): - return - self.fuse_4(erf_node, input_name_to_nodes, output_name_to_node) - - def fuse_1( - self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from PyTorch model - Fuse Gelu with Erf into one node: - Pattern 1: - +-------Mul(0.5)---------------------+ - | | - | v - [root] --> Div -----> Erf --> Add --> Mul --> - (B=1.4142...) (1) - - Pattern 2: - +------------------------------------+ - | | - | v - [root] --> Div -----> Erf --> Add --> Mul -->Mul --> - (B=1.4142...) (1) (0.5) - - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if erf_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[erf_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_erf = children[0] - - if not self.model.has_constant_input(add_after_erf, 1): - return - - if add_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_erf = children[0] - - div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node) - if div is None: - return - - if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: - return - - subgraph_input = div.input[0] - - another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0 - if subgraph_input == mul_after_erf.input[another]: # pattern 2 - children = input_name_to_nodes[mul_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_half = children[0] - if not self.model.has_constant_input(mul_half, 0.5): - return - subgraph_output = mul_half.output[0] - else: # pattern 1 - mul_half = self.model.match_parent( - mul_after_erf, "Mul", another, output_name_to_node - ) - if mul_half is None: - return - - if not self.model.has_constant_input(mul_half, 0.5): - return - - if subgraph_input not in mul_half.input: - return - - subgraph_output = mul_after_erf.output[0] - - subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "Gelu", inputs=[subgraph_input], outputs=[subgraph_output] - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_2( - self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from Keras model - Fuse Gelu with Erf into one node: - +------------------------------------------+ - | | - | v - [root] --> Div -----> Erf --> Add --> Mul -->Mul - (B=1.4142...) (A=1) (A=0.5) - - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if erf_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[erf_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_erf = children[0] - - if not self.model.has_constant_input(add_after_erf, 1): - return - - if add_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_erf = children[0] - - if not self.model.has_constant_input(mul_after_erf, 0.5): - return - - if mul_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[mul_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul = children[0] - - div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node) - if div is None: - return - - sqrt_node = None - if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: - sqrt_node = self.model.match_parent(div, "Sqrt", 1, output_name_to_node) - if sqrt_node is None: - return - if not self.model.has_constant_input(sqrt_node, 2.0): - return - - root_node = self.model.get_parent(div, 0, output_name_to_node) - if root_node is None: - return - - if root_node.output[0] not in mul.input: - return - - subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul] - if sqrt_node: - subgraph_nodes.append(sqrt_node) - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "Gelu", inputs=[root_node.output[0]], outputs=[mul.output[0]] - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_3( - self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from TensorFlow model - Fuse Gelu with Erf into one node: - +----------------------------------------------+ - | | - | v - [root] --> Mul -----> Erf --> Add --> Mul -->Mul - (A=0.7071067690849304) (B=1) (B=0.5) - - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - - if erf_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[erf_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_erf = children[0] - - if not self.model.has_constant_input(add_after_erf, 1): - return - - if add_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_half = children[0] - - if not self.model.has_constant_input(mul_half, 0.5): - return - - first_mul = self.model.match_parent(erf_node, "Mul", 0, output_name_to_node) - if first_mul is None: - return - - i = self.model.find_constant_input(first_mul, 0.7071067690849304, delta=0.001) - if i < 0: - return - - root_node = self.model.get_parent( - first_mul, 0 if i == 1 else 1, output_name_to_node - ) - if root_node is None: - return - - if mul_half.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[mul_half.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - last_mul = children[0] - - if not ( - last_mul.input[0] == root_node.output[0] - or last_mul.input[1] == root_node.output[0] - ): - return - - subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [last_mul.output[0]], - input_name_to_nodes, - output_name_to_node, - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "Gelu", inputs=[root_node.output[0]], outputs=[last_mul.output[0]] - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True - - def fuse_4( - self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict - ) -> Optional[bool]: - """ - This pattern is from TensorFlow model - Fuse Gelu with Erf into one node: - Pattern 1: - +-------Mul(0.5)---------------------+ - | | - | v - [root] --> Mul -----> Erf --> Add --> Mul --> - (B=0.7071...) (1) - - Pattern 2: - +------------------------------------+ - | | - | v - [root] --> Mul -----> Erf --> Add --> Mul -->Mul --> - (B=0.7071...) (1) (0.5) - - Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
- """ - if erf_node.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[erf_node.output[0]] - if len(children) != 1 or children[0].op_type != "Add": - return - add_after_erf = children[0] - - if not self.model.has_constant_input(add_after_erf, 1): - return - - if add_after_erf.output[0] not in input_name_to_nodes: - return - children = input_name_to_nodes[add_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_after_erf = children[0] - - mul_before_erf = self.model.match_parent( - erf_node, "Mul", 0, output_name_to_node - ) - if mul_before_erf is None: - return - - if self.model.find_constant_input(mul_before_erf, 0.7071, delta=0.001) != 1: - return - - subgraph_input = mul_before_erf.input[0] - - another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0 - if subgraph_input == mul_after_erf.input[another]: # pattern 2 - children = input_name_to_nodes[mul_after_erf.output[0]] - if len(children) != 1 or children[0].op_type != "Mul": - return - mul_half = children[0] - if not self.model.has_constant_input(mul_half, 0.5): - return - subgraph_output = mul_half.output[0] - else: # pattern 1 - mul_half = self.model.match_parent( - mul_after_erf, "Mul", another, output_name_to_node - ) - if mul_half is None: - return - - if not self.model.has_constant_input(mul_half, 0.5): - return - - if subgraph_input not in mul_half.input: - return - - subgraph_output = mul_after_erf.output[0] - - subgraph_nodes = [ - mul_before_erf, - erf_node, - add_after_erf, - mul_after_erf, - mul_half, - ] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - fused_node = helper.make_node( - "Gelu", inputs=[subgraph_input], outputs=[subgraph_output] - ) - fused_node.domain = "com.microsoft" - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - return True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py deleted file mode 100644 index a89e558cb76aa8208e4a19983f038e9f3584ffdb..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger - -from onnx import helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - - -class FusionGeluApproximation(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "FastGelu", ["Gelu", "BiasGelu"], "GeluApproximation") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - new_node = helper.make_node( - "FastGelu", - inputs=node.input, - outputs=node.output, - name=self.model.create_node_name( - "FastGelu", node.op_type + "_Approximation" - ), - ) - new_node.domain = "com.microsoft" - self.nodes_to_remove.append(node) - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py deleted file mode 100644 index 805cd3bf7dfbf337a633eaa583d14833cdf86282..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py +++ /dev/null @@ -1,528 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
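The `FusionGeluApproximation` pass above rewrites `Gelu`/`BiasGelu` into `FastGelu`, which is generally implemented as the tanh-based approximation of GELU; the rewrite therefore changes results slightly, which is why it is opt-in via `enable_gelu_approximation` in the fusion options later in this diff. A rough sketch of the accuracy gap, again using NumPy/SciPy only for the reference `erf` (illustrative only):

```python
import numpy as np
from scipy.special import erf

def gelu_exact(x):
    return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))

def gelu_tanh(x):
    # Tanh approximation commonly used by FastGelu-style kernels.
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

x = np.linspace(-6.0, 6.0, 1001)
print(np.max(np.abs(gelu_exact(x) - gelu_tanh(x))))  # small, on the order of 1e-3 or less
```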
-# -------------------------------------------------------------------------- -from logging import getLogger - -import numpy as np -from onnx import TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionGptAttentionPastBase(Fusion): - """Base class for GPT Attention Fusion with past state""" - - def __init__(self, model: OnnxModel, num_heads: int): - super().__init__(model, "Attention", "LayerNormalization", "with past") - self.num_heads = num_heads - self.utils = FusionUtils(model) - self.casted_attention_mask = ( - {} - ) # map from name of attention mask to the name that casted to int32 - - def match_past_pattern_1(self, concat_k, concat_v, output_name_to_node): - # Pattern 1: - # {past} - # / \ - # / \ - # Gather(axes=0, indices=0) Gather(indices=1) - # | | - # Transpose (perm=0,1,3,2) | - # | | - # Concat_k Concat_v - # | / - # Transpose (perm=0,1,3,2) / - # | / - # Unsqueeze Unsqueeze - # \ / - # \ / - # Concat - # | - # {present} - gather = self.model.get_parent(concat_v, 0, output_name_to_node) - if gather.op_type != "Gather": - logger.debug("match_past_pattern_1: expect Gather for past") - return None - - if not self.model.find_constant_input(gather, 1) == 1: - logger.debug("match_past_pattern_1: expect indices=1 for Gather of past") - return None - past = gather.input[0] - - parent = self.model.get_parent(concat_k, 0, output_name_to_node) - if parent.op_type == "Gather": - gather_past_k = parent - else: - past_k_nodes = self.model.match_parent_path( - concat_k, ["Transpose", "Gather"], [0, 0] - ) - if past_k_nodes is None: - logger.debug("match_past_pattern_1: failed match Transpose and Gather") - return None - gather_past_k = past_k_nodes[-1] - - if not self.model.find_constant_input(gather_past_k, 0) == 1: - logger.debug("match_past_pattern_1: expect indices=0 for Gather k of past") - return None - past_k = gather_past_k.input[0] - if past != past_k: - logger.debug("match_past_pattern_1: expect past to be same") - return None - - return past - - def match_past_pattern_2(self, concat_k, concat_v, output_name_to_node): - # Pattern 2: - # Split (QKV) - # / | | - # / | +----------------------+ - # | | - # | {past} | - # | | | - # Reshape Split Reshape - # | / \ | - # Transpose_k Squeeze Squeeze Transpose_v - # | | \ / - # +------|---+ \ / - # | | \ / - # Concat_k Concat_v - # | | - # Unsqueeze Unsqueeze - # \ / - # Concat - # | - # {present} - # - squeeze = self.model.get_parent(concat_v, 0, output_name_to_node) - if squeeze.op_type != "Squeeze": - logger.debug("match_past_pattern_2: expect Squeeze as parent of concat_v") - return None - - split = self.model.get_parent(squeeze, 0, output_name_to_node) - if split.op_type != "Split": - logger.debug("match_past_pattern_2: expect Split for past path") - return None - - opset_version = self.model.get_opset_version() - if opset_version < 13: - if not FusionUtils.check_node_attribute(squeeze, "axes", [0]): - logger.debug( - "match_past_pattern_2: axes != [0] for Squeeze in past path" - ) - return None - - if not FusionUtils.check_node_attribute(split, "split", [1, 1]): - logger.debug( - "match_past_pattern_2: split != [1, 1] for Split in past path" - ) - return None - else: - if not self.utils.check_node_input_value(squeeze, 1, [0]): - logger.debug( - "match_past_pattern_2: axes != [0] for Squeeze in past path" - ) - return None - - if not self.utils.check_node_input_value(split, 1, [1, 1]): - 
logger.debug( - "match_past_pattern_2: split != [1, 1] for Split in past path" - ) - return None - - if not FusionUtils.check_node_attribute(split, "axis", 0, default_value=0): - logger.debug( - "match_past_pattern_2: attribute axis of Split are not expected in past path" - ) - return None - past = split.input[0] - - past_k_nodes = self.model.match_parent_path( - concat_k, ["Squeeze", "Split"], [0, 0] - ) - if past_k_nodes is None: - logger.debug("match_past_pattern_2: failed to match past_k_nodes path") - return None - past_k = past_k_nodes[-1].input[0] - - if past != past_k: - logger.info("match_past_pattern_2: expect past to be same") - return None - - return past - - def match_present(self, concat_v, input_name_to_nodes): - unsqueeze_present_v = self.model.find_first_child_by_type( - concat_v, "Unsqueeze", input_name_to_nodes, recursive=False - ) - if not unsqueeze_present_v: - logger.info("expect unsqueeze for present") - return None - concat_present = self.model.find_first_child_by_type( - unsqueeze_present_v, "Concat", input_name_to_nodes, recursive=False - ) - if not concat_present: - logger.info("expect concat for present") - return None - - present = concat_present.output[0] - return present - - def cast_attention_mask(self, input_name): - if input_name in self.casted_attention_mask: - attention_mask_input_name = self.casted_attention_mask[input_name] - elif self.model.find_graph_input(input_name): - casted, attention_mask_input_name = self.utils.cast_graph_input_to_int32( - input_name - ) - self.casted_attention_mask[input_name] = attention_mask_input_name - else: - attention_mask_input_name, cast_node = self.utils.cast_input_to_int32( - input_name - ) - self.casted_attention_mask[input_name] = attention_mask_input_name - return attention_mask_input_name - - -class FusionGptAttention(FusionGptAttentionPastBase): - """ - Fuse GPT-2 Attention with past state subgraph into one Attention node. 
- """ - - def __init__(self, model: OnnxModel, num_heads: int): - super().__init__(model, num_heads) - - def create_attention_node( - self, - fc_weight, - fc_bias, - gemm_qkv, - past, - present, - input, - output, - mask, - is_unidirectional, - ): - attention_node_name = self.model.create_node_name("GptAttention") - attention_node = helper.make_node( - "Attention", - inputs=[input, fc_weight, fc_bias, mask, past], - outputs=[attention_node_name + "_output", present], - name=attention_node_name, - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [ - helper.make_attribute("num_heads", self.num_heads), - helper.make_attribute("unidirectional", 1 if is_unidirectional else 0), - ] - ) - - matmul_node = helper.make_node( - "MatMul", - inputs=[attention_node_name + "_output", gemm_qkv.input[1]], - outputs=[attention_node_name + "_matmul_output"], - name=attention_node_name + "_matmul", - ) - - add_node = helper.make_node( - "Add", - inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]], - outputs=[output], - name=attention_node_name + "_add", - ) - self.nodes_to_add.extend([attention_node, matmul_node, add_node]) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name - self.node_name_to_graph_name[add_node.name] = self.this_graph_name - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - past = None - present = None - return_indice = [] - qkv_nodes = self.model.match_parent_path( - normalize_node, - ["Add", "Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"], - [0, None, 0, 0, 0, 0, 0], - output_name_to_node=output_name_to_node, - return_indice=return_indice, - ) # yapf: disable - if qkv_nodes is None: - return - ( - add_qkv, - reshape_qkv, - gemm_qkv, - reshape_1, - reshape_2, - transpose_qkv, - matmul_qkv, - ) = qkv_nodes - - another_input = add_qkv.input[1 - return_indice[0]] - - v_nodes = self.model.match_parent_path( - matmul_qkv, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0] - ) - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - (concat_v, transpose_v, reshape_v, split_fc) = v_nodes - - fc_nodes = self.model.match_parent_path( - split_fc, - ["Reshape", "Gemm", "Reshape", "LayerNormalization"], - [0, 0, 0, 0], - output_name_to_node, - ) - if fc_nodes is None: - fc_nodes = self.model.match_parent_path( - split_fc, - ["Add", "MatMul", "LayerNormalization"], - [0, None, 0], - output_name_to_node, - ) - if fc_nodes is None: - logger.debug("fuse_attention: failed to match fc path") - return - fc_weight = fc_nodes[1].input[1] - i, _ = self.model.get_constant_input(fc_nodes[0]) - fc_bias = fc_nodes[0].input[i] - else: - fc_weight = fc_nodes[1].input[1] - fc_bias = fc_nodes[1].input[2] - - layernorm_before_attention = fc_nodes[-1] - - if not another_input in layernorm_before_attention.input: - logger.debug("Add and LayerNormalization shall have one same input") - return - - is_unidirectional = True - slice_mask = None - input_mask_nodes = None - concat_k_to_match = None - qk_nodes = self.model.match_parent_path( - matmul_qkv, ["Softmax", "Sub", "Mul", "Div", "MatMul"], [0, 0, 0, 0, 0] - ) - if qk_nodes is not None: - (softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes - mask_nodes = self.model.match_parent_path( - sub_qk, - [ - "Mul", - "Sub", - "Slice", - "Slice", - "Unsqueeze", - "Sub", - "Squeeze", - "Slice", - "Shape", - "Div", - ], - [1, 0, 1, 0, 1, 0, 0, 0, 0, 0], 
- ) # yapf: disable - if mask_nodes is None: - logger.debug("fuse_attention: failed to match unidirectional mask path") - return - div_mask = mask_nodes[-1] - slice_mask = mask_nodes[3] - - if div_qk != div_mask: - logger.debug("fuse_attention: skip since div_qk != div_mask") - return - else: - # New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0. - i, qk_nodes, _ = self.model.match_parent_paths( - matmul_qkv, - [ - (["Softmax", "Where", "Div", "MatMul"], [0, 0, 1, 0]), - (["Softmax", "Add", "Where", "Div", "MatMul"], [0, 0, None, 1, 0]), - ], - output_name_to_node, - ) - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk nodes") - return - - where_qk = qk_nodes[-3] - div_qk = qk_nodes[-2] - matmul_qk = qk_nodes[-1] - - if i == 1: - add_qk = qk_nodes[1] - _, input_mask_nodes, _ = self.model.match_parent_paths( - add_qk, - [ - ( - ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze", "Reshape"], - [None, 0, 1, 0, 0, 0], - ), - ( - ["Mul", "Sub", "Unsqueeze", "Unsqueeze", "Reshape"], - [None, 0, 1, 0, 0], - ), - ( - ["Mul", "Sub", "Unsqueeze", "Unsqueeze"], - [None, 0, 1, 0], - ), # useless cast and reshape are removed. - ], - output_name_to_node, - ) # yapf: disable - if input_mask_nodes is None: - logger.debug( - "fuse_attention: failed to match input attention mask path" - ) - return - - mask_nodes = self.model.match_parent_path( - where_qk, - [ - "Cast", - "Slice", - "Slice", - "Unsqueeze", - "Sub", - "Squeeze", - "Slice", - "Shape", - ], - [0, 0, 0, 1, 0, 0, 0, 0], - output_name_to_node, - ) # yapf: disable - if mask_nodes is None: - # TODO: match mask path for GPT2LMHeadModel_BeamSearchStep. - logger.debug("fuse_attention: failed to match mask path") - return - - slice_mask = mask_nodes[2] - - div_or_concat = self.model.get_parent( - mask_nodes[-1], 0, output_name_to_node - ) - if div_or_concat.op_type == "Div": - div_mask = div_or_concat - if div_qk != div_mask: - logger.debug("fuse_attention: skip since div_qk != div_mask") - return - elif div_or_concat.op_type == "Concat": - concat_k_to_match = div_or_concat - else: - logger.debug("fuse_attention: failed to match mask path") - - # Validate that the mask data is either lower triangular (unidirectional) or all ones - mask_data = numpy_helper.to_array( - self.model.get_initializer(slice_mask.input[0]) - ) - if not ( - len(mask_data.shape) == 4 - and mask_data.shape[:2] == (1, 1) - and mask_data.shape[2] == mask_data.shape[3] - ): - logger.debug("fuse_attention: skip since mask shape is not 1x1xWxW") - return - if np.allclose(mask_data, np.ones_like(mask_data)): - is_unidirectional = False - elif not np.allclose(mask_data, np.tril(np.ones_like(mask_data))): - logger.debug( - "fuse_attention: skip since mask is neither lower triangular nor ones" - ) - return - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0] - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - (transpose_q, reshape_q, split_q) = q_nodes - if split_fc != split_q: - logger.debug("fuse_attention: skip since split_fc != split_q") - return - - k_nodes = self.model.match_parent_path( - matmul_qk, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0] - ) - if k_nodes is None: - # This pattern is from pytorch 1.7.1 and transformers 4.6.1 - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Transpose", "Concat", "Transpose", "Reshape", "Split"], - [1, 0, 1, 0, 0], - ) - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - 
return - else: - (_, concat_k, transpose_k, reshape_k, split_k) = k_nodes - else: - (concat_k, transpose_k, reshape_k, split_k) = k_nodes - if split_fc != split_k: - logger.debug("fuse_attention: skip since split_fc != split_k") - return - - if concat_k_to_match and concat_k != concat_k_to_match: - logger.debug("fuse_attention: skip since concat_k != concat_k_to_match") - return - - attention_mask_input_name = "" - if input_mask_nodes is not None: - input_name = input_mask_nodes[-1].input[0] - attention_mask_input_name = self.cast_attention_mask(input_name) - - # Match past and present paths - past = self.match_past_pattern_1( - concat_k, concat_v, output_name_to_node - ) or self.match_past_pattern_2(concat_k, concat_v, output_name_to_node) - if past is None: - logger.info("fuse_attention: failed to match past path") - return - if not self.model.find_graph_input(past): - logger.debug("past is not graph input.") - # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input. - - present = self.match_present(concat_v, input_name_to_nodes) - if present is None: - logger.info("fuse_attention: failed to match present path") - return - if not self.model.find_graph_output(present): - logger.info("expect present to be graph output") - return - - self.create_attention_node( - fc_weight, - fc_bias, - gemm_qkv, - past, - present, - layernorm_before_attention.output[0], - reshape_qkv.output[0], - attention_mask_input_name, - is_unidirectional, - ) - - # we rely on prune_graph() to clean old subgraph nodes: - # qk_nodes + q_nodes + k_nodes + v_nodes + mask_nodes + [reshape_qkv, transpose_qkv, matmul_qkv] - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py deleted file mode 100644 index 138a9c5ff495d59830ec0c7761a674d7beacb834..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
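For reference, the mask validation in `FusionGptAttention.fuse` above decides between causal (unidirectional) and bidirectional attention purely from the constant 1x1xWxW mask initializer: all ones means bidirectional, a lower-triangular matrix means causal. A small NumPy illustration of that check (the helper name is made up for the example):

```python
import numpy as np

w = 4
causal = np.tril(np.ones((1, 1, w, w), dtype=np.float32))        # lower triangle on the last two axes
bidirectional = np.ones((1, 1, w, w), dtype=np.float32)

def looks_unidirectional(mask):
    # Mirrors the logic above: all ones -> bidirectional, lower triangular -> causal.
    if np.allclose(mask, np.ones_like(mask)):
        return False
    return bool(np.allclose(mask, np.tril(np.ones_like(mask))))

print(looks_unidirectional(causal), looks_unidirectional(bidirectional))  # True False
```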
-# -------------------------------------------------------------------------- -from logging import getLogger - -import numpy as np -from onnx import TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_gpt_attention import FusionGptAttentionPastBase -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -def is_close(value, expected_value): - return abs(value - expected_value) <= 1e-6 - - -class FusionGptAttentionMegatron(FusionGptAttentionPastBase): - """ - Fuse GPT-2 Attention with past state subgraph from Megatron into one Attention node. - """ - - def __init__(self, model: OnnxModel, num_heads: int): - super().__init__(model, num_heads) - - def fuse_attention_node( - self, - matmul_before_split, - add_before_split, - past, - present, - input, - reshape_qkv, - mask, - ): - attention_node_name = self.model.create_node_name("GptAttention") - int32_mask = self.cast_attention_mask(mask) - output = reshape_qkv.output[0] - i = 1 if (add_before_split.input[0] == matmul_before_split.output[0]) else 0 - attention_node = helper.make_node( - "Attention", - inputs=[ - input, - matmul_before_split.input[1], - add_before_split.input[i], - int32_mask, - past, - ], - outputs=[output, present], - name=attention_node_name, - ) - attention_node.domain = "com.microsoft" - attention_node.attribute.extend( - [ - helper.make_attribute("num_heads", self.num_heads), - helper.make_attribute( - "unidirectional", 0 - ), # unidirectional shall not be ON for 4D attention mask - ] - ) - - nodes_to_add = [attention_node] - self.nodes_to_add.extend(nodes_to_add) - - for node in nodes_to_add: - self.node_name_to_graph_name[node.name] = self.this_graph_name - - self.nodes_to_remove.append(reshape_qkv) - - # we rely on prune_graph() to clean old subgraph nodes - self.prune_graph = True - - def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention): - mask_nodes = self.model.match_parent_path( - sub_qk, ["Mul", "Sub", "Slice", "Slice"], [1, 0, 1, 0] - ) # yapf: disable - if mask_nodes is None: - logger.debug("fuse_attention: failed to match unidirectional mask path") - return None - (mul_mask, sub_mask, last_slice_mask, slice_mask) = mask_nodes - - if mul_qk.input[1] != last_slice_mask.output[0]: - logger.debug( - "fuse_attention failed: mul_qk.input[1] != last_slice_mask.output[0]" - ) - return None - - if not self.utils.check_node_input_value(mul_mask, 1, 10000.0): - logger.debug( - "fuse_attention failed: mul_mask input 1 is not constant 10000.0" - ) - return None - - if not self.utils.check_node_input_value(sub_mask, 0, 1.0): - logger.debug("fuse_attention failed: sub_mask input 0 is not constant 1.0") - return None - - if not self.model.find_graph_input(slice_mask.input[0]): - logger.info("expect slick_mask input 0 to be graph input") - return None - - if not self.utils.check_node_input_value(last_slice_mask, 1, [0]): - logger.debug( - "fuse_attention failed: last_slice_mask input 1 (starts) is not constant [0]" - ) - return None - - if not self.utils.check_node_input_value(last_slice_mask, 3, [3]): - logger.debug( - "fuse_attention failed: last_slice_mask input 3 (axes) is not constant [3]" - ) - return False - - if not self.utils.check_node_input_value(last_slice_mask, 4, [1]): - logger.debug( - "fuse_attention failed: last_slice_mask input 4 (steps) is not constant [1]" - ) - return False - - if not self.utils.check_node_input_value(slice_mask, 3, [2]): - logger.debug( - "fuse_attention failed: slice_mask input 3 
(axes) is not constant [2]" - ) - return None - - if not self.utils.check_node_input_value(slice_mask, 4, [1]): - logger.debug( - "fuse_attention failed: slice_mask input 4 (steps) is not constant [1]" - ) - return None - - last_slice_path = self.model.match_parent_path( - last_slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0] - ) - if last_slice_path is None or last_slice_path[-1] != matmul_qk: - logger.debug("fuse_attention: failed to match last slice path") - return None - - first_slice_path = self.model.match_parent_path( - slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0] - ) - if first_slice_path is None or first_slice_path[-1] != matmul_qk: - logger.debug("fuse_attention: failed to match first slice path") - return None - - first_slice_sub = self.model.match_parent_path( - slice_mask, - ["Unsqueeze", "Sub", "Gather", "Shape", "MatMul"], - [1, 0, 0, 0, 0], - ) - if first_slice_sub is None or first_slice_sub[-1] != matmul_qk: - logger.debug("fuse_attention: failed to match last slice sub path") - return None - - first_slice_sub_1 = self.model.match_parent_path( - slice_mask, - ["Unsqueeze", "Sub", "Gather", "Shape", "LayerNormalization"], - [1, 0, 1, 0, 0], - ) - if ( - first_slice_sub_1 is None - or first_slice_sub_1[-1] != layernorm_before_attention - ): - logger.debug("fuse_attention: failed to match last slice sub path 1") - return None - - return slice_mask.input[0] - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - past = None - present = None - - qkv_nodes = self.model.match_parent_path( - normalize_node, - ["Add", "Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [0, 1, None, 0, 0, 0], - output_name_to_node=output_name_to_node, - ) # yapf: disable - if qkv_nodes is None: - return - ( - add_skip, - add_after_attention, - matmul_after_attention, - reshape_qkv, - transpose_qkv, - matmul_qkv, - ) = qkv_nodes - - skip_input = add_skip.input[0] - - v_nodes = self.model.match_parent_path( - matmul_qkv, - [ - "Concat", - "Transpose", - "Reshape", - "Split", - "Add", - "MatMul", - "LayerNormalization", - ], - [1, 1, 0, 0, 0, None, 0], - ) # yapf: disable - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - ( - concat_v, - transpose_v, - reshape_v, - split_v, - add_before_split, - matmul_before_split, - layernorm_before_attention, - ) = v_nodes - if skip_input != layernorm_before_attention.input[0]: - logger.debug( - "fuse_attention: skip_input != layernorm_before_attention.input[0]" - ) - return - - qk_nodes = self.model.match_parent_path( - matmul_qkv, ["Softmax", "Sub", "Mul", "MatMul"], [0, 0, 0, 0] - ) - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return None - (softmax_qk, sub_qk, mul_qk, matmul_qk) = qk_nodes - if self.model.get_node_attribute(softmax_qk, "axis") != 3: - logger.debug("fuse_attention failed: softmax_qk axis != 3") - return None - - attention_mask = self.match_mask( - sub_qk, mul_qk, matmul_qk, layernorm_before_attention - ) - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Div", "Transpose", "Reshape", "Split"], [0, 0, 0, 0] - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - (div_q, transpose_q, reshape_q, split_q) = q_nodes - if split_v != split_q: - logger.debug("fuse_attention: skip since split_v != split_q") - return - - k_nodes = self.model.match_parent_path( - matmul_qk, - ["Div", "Transpose", "Concat", "Transpose", "Reshape", "Split"], - [1, 0, 0, 1, 0, 0], - ) - 
if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - (div_k, _, concat_k, transpose_k, reshape_k, split_k) = k_nodes - if split_v != split_k: - logger.debug("fuse_attention: skip since split_v != split_k") - return - - i, value = self.model.get_constant_input(reshape_k) - if not ( - isinstance(value, np.ndarray) - and list(value.shape) == [4] - and value[0] == 0 - and value[1] == 0 - and value[2] > 0 - and value[3] > 0 - ): - logger.debug("fuse_attention: reshape constant input is not [0, 0, N, H]") - return - - num_heads = value[2] - if num_heads != self.num_heads: - logger.info( - f"Detected num_heads={num_heads}. Ignore user specified value {self.num_heads}" - ) - self.num_heads = num_heads - - hidden_size_per_head = value[3] - i, value = self.model.get_constant_input(div_k) - expected_value = float(np.sqrt(np.sqrt(hidden_size_per_head))) - if not is_close(value, expected_value): - logger.debug( - f"fuse_attention: div_k value={value} expected={expected_value}" - ) - return - - i, value = self.model.get_constant_input(div_q) - if not is_close(value, expected_value): - logger.debug( - f"fuse_attention: div_q value={value} expected={expected_value}" - ) - return - - # Match past and present paths - past = self.match_past_pattern_2(concat_k, concat_v, output_name_to_node) - if past is None: - logger.debug("fuse_attention: match past failed") - return - if not self.model.find_graph_input(past): - logger.debug("fuse_attention: past is not graph input.") - # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input. - - present = self.match_present(concat_v, input_name_to_nodes) - if present is None: - logger.debug("fuse_attention: match present failed") - return - if not self.model.find_graph_output(present): - logger.info("fuse_attention: expect present to be graph output") - return - - self.fuse_attention_node( - matmul_before_split, - add_before_split, - past, - present, - layernorm_before_attention.output[0], - reshape_qkv, - attention_mask, - ) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py deleted file mode 100644 index 4e538cf5833d096635e461eae34ab35edd20d3b1..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py +++ /dev/null @@ -1,273 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
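The `div_q`/`div_k` check in the Megatron variant above relies on the fact that scaling Q and K separately by `head_size ** 0.25` is equivalent to the usual `1/sqrt(head_size)` scaling of the QK^T product, which is why both `Div` constants are compared against `sqrt(sqrt(head_size))`. A quick numerical confirmation (illustrative only, array shapes chosen arbitrarily):

```python
import numpy as np

head_size = 64
q = np.random.rand(8, head_size).astype(np.float32)
k = np.random.rand(8, head_size).astype(np.float32)

# Scale Q and K separately (Megatron style) vs. scaling the product once.
scaled_separately = (q / head_size ** 0.25) @ (k / head_size ** 0.25).T
scaled_once = (q @ k.T) / np.sqrt(head_size)

assert np.allclose(scaled_separately, scaled_once, atol=1e-4)
```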
-# -------------------------------------------------------------------------- -import math -from logging import getLogger -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionGptAttentionNoPast(Fusion): - """ - Fuse GPT-2 Attention without past state into one Attention node. - This does not support attention_mask graph input right now. - """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - "without past", - ) - self.where_qk_shared = None - - def get_num_heads_and_hidden_size( - self, custom_fc: NodeProto, div: NodeProto - ) -> Tuple[int, int]: - div_initializer = self.model.get_initializer(div.input[1]) - - # 检查float_data是否为空 - if len(div_initializer.float_data) > 0: - div_value = div_initializer.float_data[0] - else: - # 如果float_data为空,尝试其他方式获取数据 - # 例如,如果数据存储在raw_data中 - if len(div_initializer.raw_data) > 0: - dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type] - div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0] - else: - raise ValueError("Data not found in the div_initializer") - - for attr in custom_fc.attribute: - if attr.name == "W": - tensor_value = attr.t - tensor_shape = [dim for dim in tensor_value.dims] - break - head_dim = math.ceil(div_value * div_value) - hidden_size = tensor_shape[1] - num_heads = hidden_size // head_dim - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input: str, - output: str, - where_qk: NodeProto, - ) -> Union[NodeProto, None]: - - attention_node_name = self.model.create_node_name("Attention") - - attention_inputs = [input] - if where_qk is not None: - has_mask = 1 - has_qk_bias = 1 - attention_inputs.append(where_qk.output[0]) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend( - [helper.make_attribute("has_qk_bias", has_qk_bias)] - ) - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - return_indice = [] - add_qkv = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - add_qkv = add_before_layernorm - - qkv_paths = { - "path1": ( - ["CustomFCPluginDynamic_IxRT", "Reshape", "Transpose", "MatMul"], - [None, 0, 0, 0], - ), - "path2": ( - ["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], - [None, 0, 0], - ), - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict( - add_qkv, - qkv_paths, - output_name_to_node, - return_indice, 
- ) # yapf: disable - - if qkv_nodes is None: - return - reshape_2 = None - if qkv_path == "path1": - ( - custom_fc_after_attention, - reshape_2, - transpose_qkv, - matmul_qkv, - ) = qkv_nodes - else: - ( - custom_fc_after_attention, - transpose_qkv, - matmul_qkv, - ) = qkv_nodes - - another_input = add_qkv.input[1 - return_indice[0]] - - v_nodes = self.model.match_parent_path( - matmul_qkv, - ["Transpose", "Reshape", "Split", "CustomFCPluginDynamic_IxRT"], - [1, 0, 0, 0], - ) # yapf: disable - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - ( - transpose_v, - reshape_v, - split_v, - custom_fc_before_attention, - ) = v_nodes - - layernorm_before_attention = self.model.get_parent( - custom_fc_before_attention, 0, output_name_to_node - ) - if ( - layernorm_before_attention is None - or layernorm_before_attention.op_type != "LayerNormalization" - ): - if layernorm_before_attention.op_type != "Add": - logger.debug( - f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}" - ) - return - - if not another_input in layernorm_before_attention.input: - # match openai-gpt - if not another_input in layernorm_before_attention.output: - logger.debug("Add and LayerNormalization shall have one same input") - return - - qk_nodes = self.model.match_parent_path( - matmul_qkv, ["Softmax", "Add", "Where", "Div", "MatMul"], [0, None, 0, 1, 0] - ) - where_qk = None - matmul_qk = None - mask_return_indices = [] - if qk_nodes is not None: - (softmax_qk, add_qk, where_qk, div_qk, matmul_qk) = qk_nodes - mask_nodes = self.model.match_parent_path( - add_qk, - ["Mul", "Sub", "Cast", "Unsqueeze"], - [None, 0, 1, 0], - return_indice=mask_return_indices, - ) # yapf: disable - if mask_nodes is None: - logger.debug("fuse_attention: failed to match mask path") - return - - q_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0] - ) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - (transpose_q, reshape_q, split_q) = q_nodes - if split_v != split_q: - logger.debug("fuse_attention: skip since split_v != split_q") - return - - k_nodes = self.model.match_parent_path( - matmul_qk, ["Transpose", "Reshape", "Split"], [1, 0, 0] - ) - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - (transpose_k, reshape_k, split_k) = k_nodes - if split_v != split_k: - logger.debug("fuse_attention: skip since split_v != split_k") - return - - if where_qk is None: - return - - global num_heads, hidden_size - if self.where_qk_shared is None: - where_qk.input[1] = mask_nodes[0].output[0] - div_qk.output[0] = where_qk.output[0] - add_qk.input[1 - mask_return_indices[0]] = div_qk.output[0] - self.where_qk_shared = where_qk - self.nodes_to_remove.extend([softmax_qk, add_qk, div_qk, matmul_qk]) - - num_heads, hidden_size = self.get_num_heads_and_hidden_size( - custom_fc_after_attention, div_qk - ) - self.nodes_to_remove.extend([k_nodes[0]]) - self.nodes_to_remove.extend(v_nodes[:-2]) - else: - self.nodes_to_remove.extend( - [softmax_qk, add_qk, where_qk, div_qk, matmul_qk] - ) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes[:-1]) - - new_node = self.create_attention_node( - num_heads, - hidden_size, - custom_fc_before_attention.output[0], - transpose_qkv.output[0] if reshape_2 is None else reshape_2.output[0], - self.where_qk_shared, - ) - - self.nodes_to_add.append(new_node) - 
self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - if reshape_2 is not None: - self.nodes_to_remove.extend([reshape_2]) - self.nodes_to_remove.extend([transpose_qkv, matmul_qkv]) - diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py deleted file mode 100644 index d19c3aff604ed6f3ae673ffa0c67143b66e36aaf..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py +++ /dev/null @@ -1,511 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Dict - -import numpy as np -from onnx import TensorProto, helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionLayerNormalization(Fusion): - def __init__(self, model: OnnxModel, hidden_size): - self.hidden_size = hidden_size - super().__init__(model, "LayerNormalization", "ReduceMean") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - Fuse Layer Normalization subgraph into one node LayerNormalization: - +----------------------+ - | | - | v - [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add - (axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^ - | | - +-----------------------------------------------+ - - It also handles cases of duplicated sub nodes exported from older version of PyTorch: - +----------------------+ - | v - | +-------> Sub-----------------------------------------------+ - | | | - | | v - [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add - | ^ - | | - +----------------------+ - """ - children = self.model.get_children(node, input_name_to_nodes) - if len(children) == 0 or len(children) > 2: - return - - root_input = node.input[0] - - if children[0].op_type != "Sub" or children[0].input[0] != root_input: - return - - if len(children) == 2: - if children[1].op_type != "Sub" or children[1].input[0] != root_input: - return - - div_node = None - for child in children: - div_node = self.model.find_first_child_by_type( - child, "Div", input_name_to_nodes, recursive=False - ) - if div_node is not None: - break - if div_node is None: - return - - path_id, parent_nodes, _ = self.model.match_parent_paths( - div_node, - [ - (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]), - ( - ["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"], - [1, 0, 0, 0, 0, 0], - ), - ], - 
output_name_to_node, - ) - if path_id < 0: - return - - sub_node = parent_nodes[-1] - if sub_node not in children: - return - - second_add_node = parent_nodes[1] - i, add_weight = self.model.get_constant_input(second_add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.warning(f"epsilon value is not expeced: {add_weight}") - return - - pow_node = parent_nodes[3] - if not self.model.find_constant_input(pow_node, 2.0) == 1: - return - - mul_node = input_name_to_nodes[div_node.output[0]][0] - is_not_have_mul_and_add = False - is_not_have_mul_and_add_lst_node = None - # deal with special case : layernorm do not have mul and add - if mul_node.op_type != "Mul" and mul_node.op_type == "MatMul": - is_not_have_mul_and_add = True - is_not_have_mul_and_add_lst_node = div_node - elif mul_node.op_type != "Mul": - return - - if is_not_have_mul_and_add: - last_add_node = is_not_have_mul_and_add_lst_node - if self.hidden_size == 0: - print( - "[Error] Please add '--hidden_size' and '--num_head' to fuse layernorm ..." - ) - exit(0) - - subgraph_nodes = [node] - subgraph_nodes.extend(children) - subgraph_nodes.extend(parent_nodes[:-1]) - subgraph_nodes.extend([last_add_node]) - if len(subgraph_nodes) == 7: - self.nodes_to_remove.extend(subgraph_nodes) - else: - return - - norm_name = self.model.create_node_name( - "LayerNormalization", name_prefix="LayerNorm" - ) - np_weights = np.ones((self.hidden_size)).astype(np.float32) - np_weights_name = norm_name + "_weights" - weights_tensor = helper.make_tensor( - np_weights_name, TensorProto.FLOAT, np_weights.shape, np_weights - ) - np_bias = np.zeros((self.hidden_size)).astype(np.float32) - np_bias_name = norm_name + "_bias" - bias_tensor = helper.make_tensor( - np_bias_name, TensorProto.FLOAT, np_bias.shape, np_bias - ) - self.model.add_initializer(weights_tensor) - self.model.add_initializer(bias_tensor) - normalize_node = helper.make_node( - "LayerNormalization", - inputs=[node.input[0], np_weights_name, np_bias_name], - outputs=[last_add_node.output[0]], - name=norm_name, - ) - normalize_node.attribute.extend( - [helper.make_attribute("epsilon", float(add_weight))] - ) - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - else: - last_add_node = input_name_to_nodes[mul_node.output[0]][0] - if last_add_node.op_type != "Add": - return - - subgraph_nodes = [node] - subgraph_nodes.extend(children) - subgraph_nodes.extend(parent_nodes[:-1]) - - subgraph_nodes.extend([last_add_node, mul_node, div_node]) - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - last_add_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug(f"It is not safe to fuse LayerNormalization node. 
Skip") - return - - weight_input = mul_node.input[ - 1 - self.model.input_index(div_node.output[0], mul_node) - ] - if not self.model.is_constant_with_specified_dimension( - weight_input, 1, "layernorm weight" - ): - return - - bias_input = last_add_node.input[ - 1 - self.model.input_index(mul_node.output[0], last_add_node) - ] - if not self.model.is_constant_with_specified_dimension( - bias_input, 1, "layernorm bias" - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - normalize_node = helper.make_node( - "LayerNormalization", - inputs=[node.input[0], weight_input, bias_input], - outputs=[last_add_node.output[0]], - name=self.model.create_node_name( - "LayerNormalization", name_prefix="LayerNorm" - ), - ) - normalize_node.attribute.extend( - [helper.make_attribute("epsilon", float(add_weight))] - ) - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - - -class FusionLayerNormalizationKeras(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, "LayerNormalization", "GlobalAveragePool", "Keras layernorm" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - +-------------------------------+ - | | - | v - [Root] --> GlobalAveragePool--> Sub --> Mul --> GlobalAveragePool --> Add/Min/Max --> Sqrt --> Div --> Mul --> Add - | ^ - | | - +---------------------------------------------------------------+ - """ - children = self.model.get_children(node, input_name_to_nodes) - # print(len(children)) - if len(children) != 1: - return - - root_input = node.input[0] - - if children[0].op_type != "Sub" or children[0].input[0] != root_input: - return - - div_node = None - for child in children: - div_node = self.model.find_first_child_by_type( - child, "Div", input_name_to_nodes, recursive=False - ) - if div_node is not None: - break - if div_node is None: - return - # print('div_node_name:', div_node.name) - path_id, parent_nodes, _ = self.model.match_parent_paths( - div_node, - [ - ( - ["Sqrt", "Max", "Min", "Add", "GlobalAveragePool", "Mul", "Sub"], - [1, 0, 0, 0, None, 0, None], - ), - ], - output_name_to_node, - ) - if path_id < 0: - return - - sub_node = parent_nodes[-1] - if sub_node not in children: - return - - second_add_node = parent_nodes[3] - i, add_weight = self.model.get_constant_input(second_add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.warning(f"epsilon value is not expeced: {add_weight}") - return - - mul_node = input_name_to_nodes[div_node.output[0]][0] - if mul_node.op_type != "Mul": - return - - last_add_node = input_name_to_nodes[mul_node.output[0]][0] - if last_add_node.op_type != "Add": - return - - subgraph_nodes = [node] - subgraph_nodes.extend(children) - subgraph_nodes.extend(parent_nodes[:-1]) - - subgraph_nodes.extend([last_add_node, mul_node, div_node]) - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - last_add_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug(f"It is not safe to fuse LayerNormalization node. 
Skip") - return - - weight_input = mul_node.input[ - 1 - self.model.input_index(div_node.output[0], mul_node) - ] - if not self.model.is_constant_with_specified_dimension( - weight_input, 1, "layernorm weight" - ): - return - - bias_input = last_add_node.input[ - 1 - self.model.input_index(mul_node.output[0], last_add_node) - ] - if not self.model.is_constant_with_specified_dimension( - bias_input, 1, "layernorm bias" - ): - return - - self.nodes_to_remove.extend(subgraph_nodes) - normalize_node = helper.make_node( - "LayerNormalization", - inputs=[node.input[0], weight_input, bias_input], - outputs=[last_add_node.output[0]], - name=self.model.create_node_name( - "LayerNormalization", name_prefix="LayerNorm" - ), - ) - normalize_node.attribute.extend( - [helper.make_attribute("epsilon", float(add_weight))] - ) - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - - -class FusionLayerNormalizationTF(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "LayerNormalization", "Add", "TF") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - Layer Norm from Tensorflow model(using keras2onnx or tf2onnx): - +------------------------------------+ - | | - | | - (Cast_1) | - | | - | v (B) (B) (A) - Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocol --> Mul --> Mul --> Sub --> Add - | | | ^ ^ - | | | | | - | +--------------------------------------------------(Cast_2)-------------------------------|-------+ | - | v | - +---------------------------------------------------------------------------------------------------------------> Mul--------------------+ - """ - return_indice = [] - _, parent_nodes, return_indice = self.model.match_parent_paths( - node, - [ - ( - [ - "Sub", - "Mul", - "Mul", - "Reciprocal", - "Sqrt", - "Add", - "ReduceMean", - "Mul", - "Sub", - "ReduceMean", - ], - [1, 1, None, 0, 0, 0, None, 0, 0, None], - ), - ( - [ - "Sub", - "Mul", - "Mul", - "Reciprocal", - "Sqrt", - "Add", - "Cast", - "ReduceMean", - "Mul", - "Sub", - "ReduceMean", - ], - [1, 1, None, 0, 0, 0, 0, None, 0, 0, None], - ), - ], - output_name_to_node, - ) # yapf: disable - - if parent_nodes is None: - return - - assert len(return_indice) == 3 - if not ( - return_indice[0] in [0, 1] - and return_indice[1] in [0, 1] - and return_indice[2] in [0, 1] - ): - logger.debug( - "return indice is exepected in [0, 1], but got {return_indice}" - ) - return - - ( - sub_node_0, - mul_node_0, - mul_node_1, - reciprocol_node, - sqrt_node, - add_node_0, - ) = parent_nodes[:6] - reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[ - -4: - ] - - cast_node_3 = None - if len(parent_nodes) == 11: - cast_node_3 = parent_nodes[6] - assert cast_node_3.op_type == "Cast" - - mul_node_3 = self.model.match_parent(node, "Mul", 0, output_name_to_node) - if mul_node_3 is None: - logger.debug("mul_node_3 not found") - return - - node_before_reduce = self.model.get_parent( - reduce_mean_node_1, 0, output_name_to_node - ) - root_node = ( - node_before_reduce - if cast_node_3 is None - else self.model.get_parent(node_before_reduce, 0, output_name_to_node) - ) - if root_node is None: - logger.debug("root node is none") - return - - i, epsilon = self.model.get_constant_input(add_node_0) - if ( - epsilon is None - or epsilon <= 0 - or (epsilon > 1.0e-5 and cast_node_3 is None) - ): - logger.debug("epsilon is not matched") - return - - if cast_node_3 
is None and ( - reduce_mean_node_1.input[0] not in mul_node_3.input - or reduce_mean_node_1.input[0] not in sub_node_1.input - ): - logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") - return - - if cast_node_3 is not None and ( - node_before_reduce.input[0] not in mul_node_3.input - or reduce_mean_node_1.input[0] not in sub_node_1.input - ): - logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") - return - - if mul_node_2.input[0] != mul_node_2.input[1]: - logger.debug("mul_node_2 shall have two same inputs") - return - - subgraph_nodes = [ - node, - sub_node_0, - mul_node_0, - mul_node_1, - reciprocol_node, - sqrt_node, - add_node_0, - reduce_mean_node_0, - mul_node_2, - sub_node_1, - reduce_mean_node_1, - mul_node_3, - ] - - if cast_node_3 is not None: - cast_node_2 = self.model.match_parent( - mul_node_0, "Cast", 0, output_name_to_node - ) - if cast_node_2 is None: - logger.debug("cast_node_2 not found") - return - subgraph_nodes.extend([node_before_reduce, cast_node_2, cast_node_3]) - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - node.output, - self.model.input_name_to_nodes(), - self.model.output_name_to_node(), - ): - logger.debug("not safe to fuse layer normalization") - return - - self.nodes_to_remove.extend(subgraph_nodes) - - weight_input = mul_node_1.input[1] - bias_input = sub_node_0.input[0] - - # TODO: add epsilon attribute - fused_node = helper.make_node( - "LayerNormalization", - inputs=[mul_node_3.input[0], weight_input, bias_input], - outputs=[node.output[0]], - name=self.model.create_node_name( - "LayerNormalization", name_prefix="LayerNorm" - ), - ) - fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))]) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py deleted file mode 100644 index c0bb11b3bdd6bcbb994b8ad83501be2d9c1c4505..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
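The layer-normalization passes above (PyTorch, Keras, and TensorFlow exports) all recognize the same underlying decomposition: mean subtraction, variance, epsilon, normalization, then the learned scale and bias, so replacing the subgraph with one `LayerNormalization` node is numerically equivalent up to floating-point rounding. A minimal NumPy sketch of the computation that the ReduceMean/Sub/Pow/ReduceMean/Add/Sqrt/Div/Mul/Add chain implements (function and variable names are illustrative):

```python
import numpy as np

def layernorm_decomposed(x, weight, bias, eps=1e-5):
    # ReduceMean -> Sub -> Pow(2) -> ReduceMean -> Add(eps) -> Sqrt -> Div -> Mul -> Add
    mean = x.mean(axis=-1, keepdims=True)
    diff = x - mean
    var = (diff ** 2).mean(axis=-1, keepdims=True)
    return diff / np.sqrt(var + eps) * weight + bias

x = np.random.rand(2, 4, 8).astype(np.float32)
out = layernorm_decomposed(x, np.ones(8, dtype=np.float32), np.zeros(8, dtype=np.float32))
print(out.shape)  # (2, 4, 8)
```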
-# -------------------------------------------------------------------------- -from argparse import ArgumentParser - - -class AttentionMaskFormat: - MaskIndexEnd = 0 - MaskIndexEndAndStart = 1 - AttentionMask = 2 - NoMask = 3 - - -class FusionOptions: - """Options of fusion in graph optimization""" - - def __init__(self, model_type): - self.enable_gelu = True - self.enable_layer_norm = True - self.enable_attention = True - self.enable_skip_layer_norm = True - self.enable_embed_layer_norm = True - self.enable_bias_skip_layer_norm = True - self.enable_bias_gelu = True - self.enable_gelu_approximation = False - self.enable_qordered_matmul = True - - self.enable_shape_inference = True - self.enable_swint_opt = False - self.enable_format_roformer = False - self.enable_gpt2_classify = False - self.enable_vit = False - self.enable_omdet = False - self.attention_mask_format = AttentionMaskFormat.AttentionMask - - if model_type == "gpt2": - self.enable_skip_layer_norm = False - self.enable_gpt2_classify = True - elif model_type == "swint": - self.enable_swint_opt = True - elif model_type == "roformer": - self.enable_format_roformer = True - elif model_type == "vit": - self.enable_vit = True - elif model_type == "omdet": - self.enable_omdet = True - - def use_raw_attention_mask(self, use_raw_mask=True): - if use_raw_mask: - self.attention_mask_format = AttentionMaskFormat.AttentionMask - else: - self.attention_mask_format = AttentionMaskFormat.MaskIndexEnd - - def disable_attention_mask(self): - self.attention_mask_format = AttentionMaskFormat.NoMask - - @staticmethod - def parse(args): - options = FusionOptions(args.model_type) - if args.disable_gelu: - options.enable_gelu = False - if args.disable_layer_norm: - options.enable_layer_norm = False - if args.disable_attention: - options.enable_attention = False - if args.disable_skip_layer_norm: - options.enable_skip_layer_norm = False - if args.disable_embed_layer_norm: - options.enable_embed_layer_norm = False - if args.disable_bias_skip_layer_norm: - options.enable_bias_skip_layer_norm = False - if args.disable_bias_gelu: - options.enable_bias_gelu = False - if args.enable_gelu_approximation: - options.enable_gelu_approximation = True - if args.disable_shape_inference: - options.enable_shape_inference = False - if args.use_mask_index: - options.use_raw_attention_mask(False) - if args.no_attention_mask: - options.disable_attention_mask() - return options - - @staticmethod - def add_arguments(parser: ArgumentParser): - parser.add_argument( - "--disable_attention", - required=False, - action="store_true", - help="disable Attention fusion", - ) - parser.set_defaults(disable_attention=False) - - parser.add_argument( - "--disable_skip_layer_norm", - required=False, - action="store_true", - help="disable SkipLayerNormalization fusion", - ) - parser.set_defaults(disable_skip_layer_norm=False) - - parser.add_argument( - "--disable_embed_layer_norm", - required=False, - action="store_true", - help="disable EmbedLayerNormalization fusion", - ) - parser.set_defaults(disable_embed_layer_norm=False) - - parser.add_argument( - "--disable_bias_skip_layer_norm", - required=False, - action="store_true", - help="disable Add Bias and SkipLayerNormalization fusion", - ) - parser.set_defaults(disable_bias_skip_layer_norm=False) - - parser.add_argument( - "--disable_bias_gelu", - required=False, - action="store_true", - help="disable Add Bias and Gelu/FastGelu fusion", - ) - parser.set_defaults(disable_bias_gelu=False) - - parser.add_argument( - "--disable_layer_norm", - 
required=False, - action="store_true", - help="disable LayerNormalization fusion", - ) - parser.set_defaults(disable_layer_norm=False) - - parser.add_argument( - "--disable_gelu", - required=False, - action="store_true", - help="disable Gelu fusion", - ) - parser.set_defaults(disable_gelu=False) - - parser.add_argument( - "--enable_gelu_approximation", - required=False, - action="store_true", - help="enable Gelu/BiasGelu to FastGelu conversion", - ) - parser.set_defaults(enable_gelu_approximation=False) - - parser.add_argument( - "--disable_shape_inference", - required=False, - action="store_true", - help="disable symbolic shape inference", - ) - parser.set_defaults(disable_shape_inference=False) - - parser.add_argument( - "--use_mask_index", - required=False, - action="store_true", - help="use mask index instead of raw attention mask in attention operator", - ) - parser.set_defaults(use_mask_index=False) - - parser.add_argument( - "--no_attention_mask", - required=False, - action="store_true", - help="no attention mask. Only works for model_type=bert", - ) - parser.set_defaults(no_attention_mask=False) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py deleted file mode 100644 index 9afa3edbc37f2ddd7b15c3eb976ee1cd9e72e356..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py +++ /dev/null @@ -1,527 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple - -import numpy as np -from onnx import NodeProto, helper - -from .fusion_attention import AttentionMask -from .fusion_base import Fusion -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionQOrderedAttention(Fusion): - def __init__( - self, - model: OnnxModel, - hidden_size: int, - num_heads: int, - attention_mask: AttentionMask, - ): - self.hidden_size = hidden_size - self.num_heads = num_heads - self.attention_mask = attention_mask - - super().__init__(model, "QOrderedAttention", "QOrderedLayerNormalization") - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. 
- Args: - reshape_q (NodeProto): reshape node for Q - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - - # Check if the second input to Reshape flows through a Constant node - # TODO: Investigate why FusionAttention doesn't have such logic - constant_node = self.model.match_parent_path(reshape_q, ["Constant"], [1]) - - if constant_node is None: - return ( - self.num_heads, - self.hidden_size, - ) # Fall back to user specified value - else: - constant_node = constant_node[0] - - if len(constant_node.attribute) != 1: - return ( - self.num_heads, - self.hidden_size, - ) # Fall back to user specified value - - # This is assuming it is a Tensor attribute (this is a safe assumption) - q_shape = constant_node.attribute[0].t - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return self.num_heads, self.hidden_size # Fall back to user specified value - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - if self.num_heads > 0 and num_heads != self.num_heads: - if self.num_heads_warning: - logger.warning( - f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value." - ) - self.num_heads_warning = False # Do not show the warning more than once - - if self.hidden_size > 0 and hidden_size != self.hidden_size: - if self.hidden_size_warning: - logger.warning( - f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." 
- ) - self.hidden_size_warning = ( - False # Do not show the warning more than once - ) - - return num_heads, hidden_size - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - add_before_layernorm = self.model.match_parent_path( - normalize_node, - ["QuantizeLinear", "Add"], - [0, 0], - ) - - if add_before_layernorm is not None: - start_node = add_before_layernorm[-1] - else: - return - - # Input QDQ nodes - dequantize_input = self.model.match_parent_path( - start_node, - ["DequantizeLinear"], - [None], - ) - - if dequantize_input is None: - logger.debug( - "fuse_qordered_attention: failed to match input qdq nodes path" - ) - return - - dequantize_input = dequantize_input[-1] - - # QKV nodes - qkv_nodes = self.model.match_parent_path( - start_node, - [ - "Add", - "MatMul", - "Reshape", - "Transpose", - "DequantizeLinear", - "QuantizeLinear", - "MatMul", - ], - [None, None, 0, 0, 0, 0, 0], - ) - - if qkv_nodes is None: - logger.debug("fuse_qordered_attention: failed to match qkv path") - return - - ( - _, - projection_matmul, - reshape_qkv, - transpose_qkv, - dequantize_qkv, - quantize_qkv, - matmul_qkv, - ) = qkv_nodes - - # Make sure the Q/DQ has the proper zero points and constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_qkv, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_qkv, self.model): - return - - # Identify the root input to the Attention node - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - - other_inputs.append(input) - - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - - # V nodes - v_nodes = self.model.match_parent_path( - matmul_qkv, - [ - "Transpose", - "Reshape", - "DequantizeLinear", - "QuantizeLinear", - "Add", - "MatMul", - ], - [1, 0, 0, 0, 0, None], - ) - - if v_nodes is None: - logger.debug("fuse_qordered_attention: failed to match v path") - return - - (_, _, dequantize_v, quantize_v, add_v, matmul_v) = v_nodes - - # Make sure the Q/DQ has the proper zero points and constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_v, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_v, self.model): - return - - # V MatMul weight - dequantize_v_matmul_weight = self.model.match_parent_path( - matmul_v, ["DequantizeLinear"], [1] - ) - - if dequantize_v_matmul_weight is None: - logger.debug("fuse_qordered_attention: failed to match v path") - return - - dequantize_v_matmul_weight = dequantize_v_matmul_weight[0] - - if self.model.get_constant_value(dequantize_v_matmul_weight.input[0]) is None: - return - - # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales - # Per-channel scales are supported for weights alone - if not FusionUtils.check_qdq_node_for_fusion( - dequantize_v_matmul_weight, self.model, False - ): - return - - # QK nodes - qk_nodes = self.model.match_parent_path( - matmul_qkv, - [ - "DequantizeLinear", - "QuantizeLinear", - "Softmax", - "Add", - "Div", - "DequantizeLinear", - "QuantizeLinear", - "MatMul", - ], - [0, 0, 0, 0, None, 0, 0, 0], - ) - - if qk_nodes is None: - logger.debug("fuse_qordered_attention: failed to match qk path") - return - - ( - dequantize_qk_softmax, - quantize_qk_softmax, - softmax_qk, - add_qk, - div_qk, - dequantize_qk, - quantize_qk, - matmul_qk, - ) = qk_nodes - - # Make sure the Q/DQ has the proper zero points and 
constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_qk_softmax, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk_softmax, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(quantize_qk, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk, self.model): - return - - # Q nodes - q_nodes = self.model.match_parent_path( - matmul_qk, - [ - "Transpose", - "Reshape", - "DequantizeLinear", - "QuantizeLinear", - "Add", - "MatMul", - ], - [0, 0, 0, 0, 0, None], - ) - - if q_nodes is None: - logger.debug("fuse_qordered_attention: failed to match q path") - return - - (_, reshape_q, dequantize_q, quantize_q, add_q, matmul_q) = q_nodes - - # Make sure the Q/DQ has the proper zero points and constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_q, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_q, self.model): - return - - # Q MatMul weight - dequantize_q_matmul_weight = self.model.match_parent_path( - matmul_q, ["DequantizeLinear"], [1] - ) - - if dequantize_q_matmul_weight is None: - logger.debug("fuse_qordered_attention: failed to match q path") - return - - dequantize_q_matmul_weight = dequantize_q_matmul_weight[0] - - if self.model.get_constant_value(dequantize_q_matmul_weight.input[0]) is None: - return - - # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales - # Per-channel scales are supported for weights alone - if not FusionUtils.check_qdq_node_for_fusion( - dequantize_q_matmul_weight, self.model, False - ): - return - - # K nodes - k_nodes = self.model.match_parent_path( - matmul_qk, - [ - "Transpose", - "Reshape", - "DequantizeLinear", - "QuantizeLinear", - "Add", - "MatMul", - ], - [1, 0, 0, 0, 0, None], - ) - - if k_nodes is None: - logger.debug("fuse_qordered_attention: failed to match k path") - return - - (_, _, dequantize_k, quantize_k, add_k, matmul_k) = k_nodes - - # Make sure the Q/DQ has the proper zero points and constant per-tensor scales - if not FusionUtils.check_qdq_node_for_fusion(quantize_k, self.model): - return - - if not FusionUtils.check_qdq_node_for_fusion(dequantize_k, self.model): - return - - # K MatMul weight - dequantize_k_matmul_weight = self.model.match_parent_path( - matmul_k, ["DequantizeLinear"], [1] - ) - - if dequantize_k_matmul_weight is None: - logger.debug("fuse_qordered_attention: failed to match k path") - return - - dequantize_k_matmul_weight = dequantize_k_matmul_weight[0] - - if self.model.get_constant_value(dequantize_k_matmul_weight.input[0]) is None: - return - - # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales - # Per-channel scales are supported for weights alone - if not FusionUtils.check_qdq_node_for_fusion( - dequantize_k_matmul_weight, self.model, False - ): - return - - # Mask nodes - mask_nodes = self.model.match_parent_path( - add_qk, ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0, 0] - ) - - if mask_nodes is None: - logger.debug("fuse_qordered_attention: failed to match mask_nodes path") - return - - # Ascertain `qkv_hidden_sizes` attribute value - q_weight = self.model.get_initializer(dequantize_q_matmul_weight.input[0]) - k_weight = self.model.get_initializer(dequantize_k_matmul_weight.input[0]) - v_weight = self.model.get_initializer(dequantize_v_matmul_weight.input[0]) - - qw = NumpyHelper.to_array(q_weight) - kw = NumpyHelper.to_array(k_weight) - vw = 
NumpyHelper.to_array(v_weight) - - qw_out_size = np.prod(qw.shape[1:]) - kw_out_size = np.prod(kw.shape[1:]) - vw_out_size = np.prod(vw.shape[1:]) - - # Form QOrderedAttention node - if ( - matmul_v.input[0] == root_input - and matmul_q.input[0] == root_input - and matmul_k.input[0] == root_input - ): - mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) - - # Ascertain `num_heads` and `hidden_size` - num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q) - - # Formulate the inputs - # Actual quantized input - attention_inputs = [dequantize_input.input[0]] - attention_inputs.append(dequantize_input.input[1]) - - attention_inputs.append(dequantize_q.input[1]) - attention_inputs.append(dequantize_k.input[1]) - attention_inputs.append(dequantize_v.input[1]) - - attention_inputs.append(dequantize_q_matmul_weight.input[0]) - attention_inputs.append(dequantize_k_matmul_weight.input[0]) - attention_inputs.append(dequantize_v_matmul_weight.input[0]) - - attention_inputs.append(dequantize_q_matmul_weight.input[1]) - attention_inputs.append(dequantize_k_matmul_weight.input[1]) - attention_inputs.append(dequantize_v_matmul_weight.input[1]) - - if self.model.get_initializer(add_q.input[0]): - attention_inputs.append(add_q.input[0]) - else: # second input is the constant bias - attention_inputs.append(add_q.input[1]) - - if self.model.get_initializer(add_k.input[0]): - attention_inputs.append(add_k.input[0]) - else: # second input is the constant bias - attention_inputs.append(add_k.input[1]) - - if self.model.get_initializer(add_v.input[0]): - attention_inputs.append(add_v.input[0]) - else: # second input is the constant bias - attention_inputs.append(add_v.input[1]) - - attention_inputs.append(quantize_qk.input[1]) - attention_inputs.append(quantize_qk_softmax.input[1]) - attention_inputs.append(dequantize_qkv.input[1]) - - # Mask input - if mask_index is not None: - attention_inputs.append(mask_index) - else: - attention_inputs.append("") - - # The MatMul weight 'B' and 'bias' need some post-processing - # Transpose weight 'B' from order ROW to order COL - # This offline transpose is needed only while using the CUDA EP - # TODO: Make this fusion logic EP-agnostic ? 
- q_weight_tensor = self.model.get_initializer( - dequantize_q_matmul_weight.input[0] - ) - FusionUtils.transpose_2d_int8_tensor(q_weight_tensor) - - k_weight_tensor = self.model.get_initializer( - dequantize_k_matmul_weight.input[0] - ) - FusionUtils.transpose_2d_int8_tensor(k_weight_tensor) - - v_weight_tensor = self.model.get_initializer( - dequantize_v_matmul_weight.input[0] - ) - FusionUtils.transpose_2d_int8_tensor(v_weight_tensor) - - # Name and create Attention node - attention_node_name = self.model.create_node_name("QOrderedAttention") - - attention_node = helper.make_node( - "QOrderedAttention", - inputs=attention_inputs, - outputs=[reshape_qkv.output[0]], - name=attention_node_name, - ) - - self.model.replace_node_input( - dequantize_qkv, dequantize_qkv.input[0], attention_node.output[0] - ) - self.model.replace_node_input( - projection_matmul, projection_matmul.input[0], dequantize_qkv.output[0] - ) - - attention_node.attribute.extend( - [helper.make_attribute("num_heads", num_heads)] - ) - attention_node.attribute.extend([helper.make_attribute("order_input", 1)]) - attention_node.attribute.extend([helper.make_attribute("order_weight", 0)]) - attention_node.attribute.extend([helper.make_attribute("order_output", 1)]) - attention_node.attribute.extend( - [ - helper.make_attribute( - "qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size] - ) - ] - ) - - attention_node.domain = "com.microsoft" - - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - - self.nodes_to_remove.extend( - [reshape_qkv, transpose_qkv, quantize_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes) - self.nodes_to_remove.extend( - [ - dequantize_q_matmul_weight, - dequantize_k_matmul_weight, - dequantize_v_matmul_weight, - ] - ) - - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py deleted file mode 100644 index ebd165c4bc5da002eb53b2376c1e69facf40dec4..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Dict - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionQOrderedGelu(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "QOrderedGelu", ["Gelu", "FastGelu"]) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - INPUT PATTERN - Fuse (quantized) Gelu subgraph into one node QOrderedGelu: - -> quantized input -> DQ -> Gelu -> Q -> - - (or) - - -> quantized input -> DQ -> FastGelu -> Q -> - - OUTPUT PATTERN - -> QOrderedGelu -> - """ - gelu_children = self.model.get_children(node, input_name_to_nodes) - - # Should only have 1 child - QuantizeLinear (or) - # Should have 2 children - QuantizeLinear + Shape - if not ( - (len(gelu_children) == 1 and gelu_children[0].op_type == "QuantizeLinear") - or ( - len(gelu_children) == 2 - and gelu_children[0].op_type == "QuantizeLinear" - and gelu_children[1].op_type == "Shape" - ) - ): - return - - downstream_quantize_node = gelu_children[0] - downstream_shape_node = None - - if len(gelu_children) == 2: - downstream_shape_node = gelu_children[1] - - if not FusionUtils.check_qdq_node_for_fusion( - downstream_quantize_node, self.model - ): - return - - # The first input to Gelu should flow through a DequantizeLinear node - first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( - node, - [(["DequantizeLinear"], [0])], - output_name_to_node, - ) - - if first_path_id < 0: - return - - upstream_dequantize_node = first_input_parent_nodes[0] - - if not FusionUtils.check_qdq_node_for_fusion( - upstream_dequantize_node, self.model - ): - return - - # Fusion logic - subgraph_nodes = [node] # Gelu/FastGelu - subgraph_nodes.extend( - [downstream_quantize_node, upstream_dequantize_node] - ) # Relevant Q, DQ nodes - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [node.output[0], downstream_quantize_node.output[0]] - if downstream_shape_node is not None - else downstream_quantize_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug(f"It is not safe to fuse QOrderedGelu node. Skip") - return - - self.nodes_to_remove.extend(subgraph_nodes) - - ordered_gelu_node = helper.make_node( - "QOrderedGelu", - inputs=[ - upstream_dequantize_node.input[0], - upstream_dequantize_node.input[1], - downstream_quantize_node.input[1], - ], - outputs=[downstream_quantize_node.output[0]], - name=self.model.create_node_name( - "QOrderedGelu", name_prefix="QOrderedGelu" - ), - ) - - # Arrange the downstream Shape's input to be fed from the - # downstream QuantizeLinear node, so that fusion will - # be deemed safe - if downstream_shape_node is not None: - self.model.replace_node_input( - downstream_shape_node, - downstream_shape_node.input[0], - downstream_quantize_node.output[0], - ) - - # TODO: We only support CuBlasLt order ORDER_ROW for now. - # Once we start supporting other data ordering format(s), we - # will support user configuring the data ordering for the op. 
- ordered_gelu_node.attribute.extend([helper.make_attribute("order_X", 1)]) - ordered_gelu_node.attribute.extend([helper.make_attribute("order_Y", 1)]) - - ordered_gelu_node.domain = "com.microsoft" - - self.nodes_to_add.append(ordered_gelu_node) - self.node_name_to_graph_name[ordered_gelu_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py deleted file mode 100644 index 94e38a0f5b549cb217359926172eb4aa510ad68b..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Dict - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionQOrderedLayerNormalization(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "QOrderedLayerNormalization", "LayerNormalization") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - Fuse (quantized) Layer Normalization subgraph into one node QOrderedLayerNormalization: - quantized input -> DQ - | - | - (other inputs)-> LayerNormalization --> Q --> - - should become - - (quantized input + other inputs)-> QOrderedLayerNormalization --> Q --> - """ - - children = self.model.get_children(node, input_name_to_nodes) - - # Should only have 1 child - QuantizeLinear (or) - # Should have 2 children - QuantizeLinear + Shape - if not ( - (len(children) == 1 and children[0].op_type == "QuantizeLinear") - or ( - len(children) == 2 - and children[0].op_type == "QuantizeLinear" - and children[1].op_type == "Shape" - ) - ): - return - - downstream_quantize_node = children[0] - downstream_shape_node = None - - if len(children) == 2: - downstream_shape_node = children[1] - - if not FusionUtils.check_qdq_node_for_fusion( - downstream_quantize_node, self.model - ): - return - - # The first input to LayerNormalization should flow through a DequantizeLinear node - first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( - node, - [(["DequantizeLinear"], [0])], - output_name_to_node, - ) - - if first_path_id < 0: - return - - upstream_dequantize_node = first_input_parent_nodes[0] - - if not FusionUtils.check_qdq_node_for_fusion( - upstream_dequantize_node, self.model - ): - return - - # Fusion logic 
- subgraph_nodes = [node] # LayerNormalization - subgraph_nodes.extend( - [downstream_quantize_node] - ) # Q node after LayerNormalization - - upstream_dequantize_node_children = self.model.get_children( - upstream_dequantize_node, input_name_to_nodes - ) - - # In GPT2, the DQ node will be feeding a residual downstream Add and hence, - # we do not want to remove it - if len(upstream_dequantize_node_children) == 1: - subgraph_nodes.extend( - [upstream_dequantize_node] - ) # DQ node before LayerNormalization - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - [node.output[0], downstream_quantize_node.output[0]] - if downstream_shape_node is not None - else downstream_quantize_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug( - f"It is not safe to fuse QOrderedLayerNormalization node. Skip" - ) - return - - self.nodes_to_remove.extend(subgraph_nodes) - - normalize_node = helper.make_node( - "QOrderedLayerNormalization", - inputs=[ - upstream_dequantize_node.input[0], - upstream_dequantize_node.input[1], - node.input[1], - node.input[2], - downstream_quantize_node.input[1], - ], - outputs=[downstream_quantize_node.output[0]], - name=self.model.create_node_name( - "QOrderedLayerNormalization", name_prefix="QOrderedLayerNormalization" - ), - ) - - # Arrange the downstream Shape's input to be fed from the - # downstream QuantizeLinear node, so that fusion will - # be deemed safe - if downstream_shape_node is not None: - self.model.replace_node_input( - downstream_shape_node, - downstream_shape_node.input[0], - downstream_quantize_node.output[0], - ) - - # TODO: We only support CuBlasLt order ORDER_ROW for now. - # Once we start supporting other data ordering format(s), we - # will support user configuring the data ordering for the op. - normalize_node.attribute.extend([helper.make_attribute("order_X", 1)]) - normalize_node.attribute.extend([helper.make_attribute("order_Y", 1)]) - - normalize_node.domain = "com.microsoft" - - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py deleted file mode 100644 index 8c8050e1cdfb0061b734b1224aa0006b1c09cdef..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Dict - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionQOrderedMatMul(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "QOrderedMatMul", "MatMul") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - matmul_children = self.model.get_children(node, input_name_to_nodes) - - # Should only have 1 child - Bias Add - if len(matmul_children) != 1 or matmul_children[0].op_type != "Add": - return - - bias_add_node = matmul_children[0] - - # Atleast one of the inputs to Bias Add node must be a constant - bias_add_node_index = 0 - if ( - self.model.get_constant_value(bias_add_node.input[0]) is None - and self.model.get_constant_value(bias_add_node.input[1]) is None - ): - return - - if self.model.get_constant_value(bias_add_node.input[0]) is None: - bias_add_node_index = 1 - - bias_add_children = self.model.get_children(bias_add_node, input_name_to_nodes) - - if len(bias_add_children) != 1: - return - - bias_add_child = bias_add_children[0] - - # Bias Add can have another Add downstream (Residual Add layer) - residual_add_node = None - - downstream_quantize_node = None - - if bias_add_child.op_type == "Add": - residual_add_node = bias_add_child - - residual_add_children = self.model.get_children( - residual_add_node, input_name_to_nodes - ) - - if ( - len(residual_add_children) != 1 - or residual_add_children[0].op_type != "QuantizeLinear" - ): - return - - downstream_quantize_node = residual_add_children[0] - - elif bias_add_child.op_type == "QuantizeLinear": - downstream_quantize_node = bias_add_child - - else: - return - - # Make sure the downstream QuantizeLinear has the proper zero points and scales - if not FusionUtils.check_qdq_node_for_fusion( - downstream_quantize_node, self.model - ): - return - - # The first input to MatMul should flow through a DequantizeLinear node - first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( - node, - [(["DequantizeLinear"], [0])], - output_name_to_node, - ) - - # If Attention is not fused, this is the pattern to look for - # leading upto the MatMul - reshape_node_0 = None - transpose_node_0 = None - if first_path_id < 0: - first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( - node, - [ - ( - ["Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear"], - [0, 0, 0, 0], - ) - ], - output_name_to_node, - ) - - if first_path_id < 0: - return - - reshape_node_0 = first_input_parent_nodes[0] - transpose_node_0 = first_input_parent_nodes[1] - dequantize_node_0 = first_input_parent_nodes[2] - else: - dequantize_node_0 = first_input_parent_nodes[0] - - # Make sure the upstream DequantizeLinear-0 has the proper zero points and scales - if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_0, self.model): - return - - # The second input to MatMul should flow through a DequantizeLinear node - dequantize_node_1 = None - is_weight_transpose_required = True - - weight_path_id, weight_nodes, _ = self.model.match_parent_paths( - node, - [ - ( - [ - "DequantizeLinear", - "QuantizeLinear", - "Transpose", - "DequantizeLinear", - ], - [1, 0, 0, 0], - ) - ], - output_name_to_node, - ) - - if weight_path_id < 0: - weight_path_id, weight_nodes, _ = self.model.match_parent_paths( - node, - [(["DequantizeLinear"], 
[1])], - output_name_to_node, - ) - - if weight_path_id < 0: - return - - dequantize_node_1 = weight_nodes[0] - else: - is_weight_transpose_required = False - dequantize_node_1 = weight_nodes[3] - - # Check if weight 'B' is a constant - if self.model.get_constant_value(dequantize_node_1.input[0]) is None: - return - - # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales - # Per-channel scales are supported for weights alone - if not FusionUtils.check_qdq_node_for_fusion( - dequantize_node_1, self.model, False - ): - return - - # Make sure the upstream flow into the Residual Add node flows through a DQ node - residual_add_dequantize_node = None - - if residual_add_node is not None: - ( - residual_path_id, - residual_input_parent_nodes, - _, - ) = self.model.match_parent_paths( - residual_add_node, - [ - (["DequantizeLinear"], [1]), - ], - output_name_to_node, - ) - - if residual_path_id < 0: - return - - residual_add_dequantize_node = residual_input_parent_nodes[0] - - # Make sure the upstream DequantizeLinear to the Residual Add has the proper zero points and scales - if ( - residual_add_dequantize_node is not None - and not FusionUtils.check_qdq_node_for_fusion( - residual_add_dequantize_node, self.model - ) - ): - return - - # Subgraph nodes to be fused - subgraph_nodes = [node, bias_add_node] # MatMul + Bias Add - - if residual_add_node is not None: - subgraph_nodes.extend([residual_add_node]) # Residual Add - - subgraph_nodes.extend(weight_nodes) - subgraph_nodes.extend([downstream_quantize_node]) # Downstream Q node - - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - downstream_quantize_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug(f"It is not safe to fuse QOrderedMatMul node. Skip") - return - - # Deal with the case where-in the Attention subgraph is not fused - if transpose_node_0 is not None: - self.model.replace_node_input( - transpose_node_0, transpose_node_0.input[0], dequantize_node_0.input[0] - ) - - # Make inputs - fused_node_inputs = [ - reshape_node_0.output[0] - if reshape_node_0 is not None - else dequantize_node_0.input[0], - dequantize_node_0.input[1], - dequantize_node_1.input[0], - dequantize_node_1.input[1], - downstream_quantize_node.input[1], - bias_add_node.input[bias_add_node_index], - ] - - if residual_add_node is not None: - fused_node_inputs.append(residual_add_dequantize_node.input[0]) - fused_node_inputs.append(residual_add_dequantize_node.input[1]) - - # The MatMul weight 'B' and 'bias' need some post-processing - # Transpose weight 'B' from order ROW to order COL - # This offline transpose is needed only while using the CUDA EP - # TODO: Make this fusion logic EP-agnostic ? 
- if is_weight_transpose_required: - weight_tensor = self.model.get_initializer(dequantize_node_1.input[0]) - FusionUtils.transpose_2d_int8_tensor(weight_tensor) - - fused_node = helper.make_node( - "QOrderedMatMul", - inputs=fused_node_inputs, - outputs=[downstream_quantize_node.output[0]], - name=self.model.create_node_name( - "QOrderedMatMul", name_prefix="QOrderedMatMul" - ), - ) - - fused_node.attribute.extend([helper.make_attribute("order_A", 1)]) - fused_node.attribute.extend([helper.make_attribute("order_B", 0)]) - fused_node.attribute.extend([helper.make_attribute("order_Y", 1)]) - - fused_node.domain = "com.microsoft" - - self.nodes_to_remove.extend(subgraph_nodes) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py deleted file mode 100644 index 2a5bf73fdf07f223be18e7bbaf20f9623ebb3fdc..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger - -import numpy as np -from onnx import TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionReshape(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "Reshape", "Reshape") - self.prune_graph: bool = False - - def replace_reshape_node(self, shape, reshape_node, concat_node): - shape_value = np.asarray([int(x) if isinstance(x, np.ndarray) else x for x in shape], dtype=np.int64) - constant_shape_name = self.model.create_node_name("Constant", "constant_shape") - new_node = helper.make_node( - "Constant", - inputs=[], - outputs=[constant_shape_name], - value=helper.make_tensor( - name="const_tensor", - data_type=TensorProto.INT64, - dims=shape_value.shape, - vals=bytes(shape_value), - raw=True, - ), - ) - reshape_node.input[1] = constant_shape_name - reshape_node.name = self.model.create_node_name("Reshape", "Reshape_Fuse") - self.nodes_to_remove.extend([concat_node]) - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node): - if reshape_node.input[1] not in output_name_to_node: - return - - concat_node = output_name_to_node[reshape_node.input[1]] - if ( - concat_node.op_type != "Concat" - or len(concat_node.input) < 3 - or len(concat_node.input) > 4 - ): - return - - path0 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Gather", "Shape"], - [0, 0, 0], - output_name_to_node, - ) - if path0 is None: - return - - (unsqueeze_0, gather_0, shape_0) = path0 - - path1 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Gather", "Shape"], - [1, 0, 0], - output_name_to_node, - ) - if path1 is None: - return - (unsqueeze_1, gather_1, shape_1) = path1 - - shape = [] - gather_value = self.model.get_constant_value(gather_0.input[1]) - if gather_value == 0: - shape.append(0) - - gather_value = self.model.get_constant_value(gather_1.input[1]) - if gather_value == 1: - shape.append(0) - - if len(shape) != 2: - return - - path2 = [] - path3 = [] - shape_nodes = [shape_0, shape_1] - if ( - len(concat_node.input) == 3 - and self.model.get_initializer(concat_node.input[2]) is None - ): - path2 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Mul", "Gather", "Shape"], - [2, 0, 0, 0], - output_name_to_node, - ) - if path2 is None: - path2 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"], - [2, 0, 0, 0, 0], - output_name_to_node, - ) # GPT2 exported by PyTorch 1.4 with opset_version=11 - if path2 is None: - return - - path3 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Mul", "Gather", "Shape"], - [2, 0, 1, 0], - output_name_to_node, - ) - if path3 is None: - path3 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"], - [2, 0, 1, 0, 0], - output_name_to_node, - ) # GPT2 exported by PyTorch 1.4 with opset_version=11 - if path3 is None: - return - - shape_nodes.extend([path2[-1], path3[-1]]) - shape.append(-1) - elif len(concat_node.input) > 2: - concat_2 = self.model.get_initializer(concat_node.input[2]) - if concat_2 is None: - return - concat_value = numpy_helper.to_array(concat_2) - if isinstance(concat_value, list): - shape.extend(concat_value) - else: - shape.append(concat_value) - - if ( - 
len(concat_node.input) == 4 - and self.model.get_initializer(concat_node.input[3]) is None - ): - if -1 in shape: - return - - path2 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Div", "Gather", "Shape"], - [3, 0, 0, 0], - output_name_to_node, - ) - if path2 is None: - path2 = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Div", "Squeeze", "Slice", "Shape"], - [3, 0, 0, 0, 0], - output_name_to_node, - ) # GPT2 exported by PyTorch 1.4 with opset_version=11 - if path2 is None: - return - shape_nodes.extend([path2[-1]]) - shape.append(-1) - elif len(concat_node.input) > 3: - concat_3 = self.model.get_initializer(concat_node.input[3]) - if concat_3 is None: - return - - concat_value = numpy_helper.to_array(concat_3) - if isinstance(concat_value, list): - shape.extend(concat_value) - else: - shape.append(concat_value) - - root_input = reshape_node.input[0] - same_shape_input = True - for shape_node in shape_nodes: - if shape_node.input[0] != root_input: - same_shape_input = False - - if not same_shape_input: - return - - self.replace_reshape_node(shape, reshape_node, concat_node) - - # TODO(tlwu): Subgraph blocks pruning un-used nodes. Add code to remove un-used nodes safely. - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py deleted file mode 100644 index b3ec51a5a25af26a36ef9fc0015b80104e4cd67f..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-# - -import logging -from typing import Dict - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = logging.getLogger(__name__) - - -class FusionRMSNorm(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "RMSNorm", "Mul") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - if node.op_type != "Mul": - return - - sim_ln_nodes = None - # SimplifiedLayerNorm calculation (notation from https://onnx.ai/onnx/operators/onnx__LayerNormalization.html#summary): - # DD = Pow(D, 2) - # Var = ReduceMean(DD) - # VarEps = Add(Var, epsilon) - # StdDev = Sqrt(VarEps) - # InvStdDev = Div(1, StdDev) - # Normalized = Mul(D, InvStdDev) - # NormalizedScaled = Mul(Normalized, Scale) - - # RMSNorm - # +-------------------------------------------------------+ - # | | - # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_1 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], - [1, 1, 1, 0, 0, 0, 0], - ) - # RMSNorm - # +-------------------------------------------------------+ - # | | - # Gather --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_2 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Gather"], - [1, 1, 1, 0, 0, 0, 0], - ) - - # For LLaMA from Microsoft custom export: - # sim_ln_nodes_3 uses a different start parent index than sim_ln_nodes_1 - # - # RMSNorm - # +-------------------------------------------------------+ - # | | - # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_3 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], - [0, 1, 1, 0, 0, 0, 0], - ) - - # sim_ln_nodes_4 starts with a graph input instead of an Add node like sim_ln_nodes_3 - # - # RMSNorm - # +-----------------------------------------------+ - # | | - # graph_input --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul - # | - # node - sim_ln_nodes_4 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow"], - [0, 1, 1, 0, 0, 0], - ) - - add_node, pow_node = None, None - if sim_ln_nodes_1 is not None: - sim_ln_nodes = sim_ln_nodes_1 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] - elif sim_ln_nodes_2 is not None: - sim_ln_nodes = sim_ln_nodes_2 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] - elif sim_ln_nodes_3 is not None: - sim_ln_nodes = sim_ln_nodes_3 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] - elif sim_ln_nodes_4 is not None: - sim_ln_nodes = sim_ln_nodes_4 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-1] - # Verify that parent input to Pow node is graph_input - if pow_node.input[0] not in self.model.get_graphs_input_names(): - return - else: - return - - layernorm_weight_index = ( - 1 if sim_ln_nodes in (sim_ln_nodes_3, sim_ln_nodes_4) else 0 - ) - starts_with_graph_input = sim_ln_nodes == sim_ln_nodes_4 - - if self.model.find_constant_input(pow_node, 2.0) != 1: - return - - root_input = pow_node.input[0] - if root_input != sim_ln_nodes[0].input[0]: - return - - i, add_weight = self.model.get_constant_input(add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.warning(f"epsilon value is not expected: {add_weight}") - return - - self.nodes_to_remove.extend( - sim_ln_nodes[:-1] if not 
starts_with_graph_input else sim_ln_nodes - ) - self.nodes_to_remove.append(node) - - normalize_node = helper.make_node( - "RMSNormPluginDynamic_IxRT", - inputs=[root_input, node.input[layernorm_weight_index]], - outputs=[node.output[0]], - name=self.model.create_node_name( - "RMSNormPluginDynamic_IxRT", name_prefix="RMSNorm_" - ), - ) - - normalize_node.domain = "com.iluvatar" - normalize_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - normalize_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - normalize_node.attribute.extend( - [helper.make_attribute("epsilon", float(add_weight))] - ) - normalize_node.attribute.extend([helper.make_attribute("axis", -1)]) - normalize_node.attribute.extend([helper.make_attribute("stash_type", 1)]) - gamma_data = self.model.get_initializer(normalize_node.input[1]) - gamma_data_np = NumpyHelper.to_array(gamma_data) - normalize_node.attribute.extend( - [helper.make_attribute("hidden_size", int(gamma_data_np.shape[0]))] - ) - - normalize_node.attribute.extend([helper.make_attribute("gamma", gamma_data)]) - - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - return True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py deleted file mode 100644 index 1d99595e8e8d9dc1cde4da1c66f266251d0919ca..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py +++ /dev/null @@ -1,371 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionRoformerCrossAttention(Fusion): - """ - Fuse VideoBertAttention subgraph into one Attention node. 
- """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQkvCrossToContext_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size( - self, custom_fc: NodeProto, mul: NodeProto - ) -> Tuple[int, int]: - mul_initializer = self.model.get_initializer(mul.input[1]) - - # 检查float_data是否为空 - if len(mul_initializer.float_data) > 0: - mul_value = mul_initializer.float_data[0] - else: - # 如果float_data为空,尝试其他方式获取数据 - # 例如,如果数据存储在raw_data中 - if len(mul_initializer.raw_data) > 0: - dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type] - mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0] - else: - raise ValueError("Data not found in the mul_initializer") - - for attr in custom_fc.attribute: - if attr.name == "W": - tensor_value = attr.t - tensor_shape = [dim for dim in tensor_value.dims] - break - head_dim = math.floor(1.0 / (mul_value * mul_value)) - hidden_size = tensor_shape[0] - num_heads = hidden_size // head_dim - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input_q: str, - input_k: str, - input_v: str, - input_mask: str, - output: str, - matmul_qk_add: NodeProto, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input_q: str, - input_k: str, - input_v: str, - input_mask: str, - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("CrossAttention") - - attention_inputs = [input_q, input_k, input_v, input_mask] - - attention_node = helper.make_node( - "CustomQkvCrossToContext_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("type_mask", 4)]) #3:float mask 4:int32 mask - attention_node.attribute.extend([helper.make_attribute("scale", 1.0 / 8)]) #1 /sqrt(num_heads) - - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - - return attention_node - - def get_shape(self, edge_name): - for info in self.model.graph().value_info: - if info.name == edge_name: - return info.type.tensor_type.shape.dim - return None - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
- qkv_paths = { - "path1": ( - [ - "CustomFCPluginDynamic_IxRT", - "Reshape", - "Transpose", - "Reshape", - "MatMul", - ], - [0, 0, 0, 0, 0], - ), - "path2": ( - [ - "CustomFCPluginDynamic_IxRT", - "Reshape", - "Transpose", - "Reshape", - "MatMul", - ], - [1, 0, 0, 0, 0], - ), - } - # print('start_nodes:', start_node.name) - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - - fc_after_atten = None - if qkv_path in ["path1", "path2"]: - ( - fc_after_atten, - reshape_qkv_2, - transpose_qkv, - reshape_qkv_1, - matmul_qkv, - ) = qkv_nodes - - """ - Match - Add --> LayerNormalization --> Attention --> Add --> LayerNormalization - | | - | | - +--------------------------------------------------------- - """ - add_before_layernorm = self.model.match_parent(start_node, "Add", None) - if add_before_layernorm is not None: - node_children = input_name_to_nodes[add_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - v_paths = {"path1": (["Reshape", "Transpose", "Reshape"], [1, 0, 0])} - - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if v_path == "path1": - (reshape_v, transpose_v, v_reshape) = v_nodes - - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - - qk_paths = { - "path1": ( - ["Softmax", "Add", "Mul", "Mul", "Reshape", "MatMul"], - [0, 0, None, None, None, 0], - ) - } - - qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - # print('qk_nodes', qk_nodes[0].name) - matmul_qk_add = None - if qk_path == "path1": - (_, add_mask, mul_mask, mul_qk, reshape_qk, matmul_qk) = qk_nodes - - q_paths = { - "path1": (["Transpose", "Add"], [0, 0]), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - # print('q_nodes', q_nodes[0].name) - if q_path == "path1": - (q_tranpose, q_add) = q_nodes - - k_paths = { - "path1": (["Reshape", "Transpose", "Add"], [1, 0, 0]), - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - # print('k_nodes', k_nodes[0].name) - if k_path == "path1": - (_, k_transpose, k_add) = k_nodes - # print('add_mask', add_mask.name) - mask_paths = { - "path1": ( - ["Mul", "Sub", "Unsqueeze", "Cast", "Greater"], - [1, None, 1, 0, 0], - ) - } - mask_nodes, mask_path = self.match_parent_path_from_dict(add_mask, mask_paths) - - if mask_nodes is None: - logger.debug("fuse_attention: failed to match mask path") - return - # print('mask_nodes', mask_nodes[0].name) - (_, mask_sub, mask_unsqueeze, mask_cast, mask_greater) = mask_nodes - - if ( - self.get_shape(q_add.output[0]) == self.get_shape(k_add.output[0]) - and self.get_shape(k_add.output[0]) == self.get_shape(v_reshape.output[0]) - and mul_mask.input[1] in mask_unsqueeze.output - ): - attention_last_node = reshape_qkv_1 - - num_heads, hidden_size = self.get_num_heads_and_hidden_size( - fc_after_atten, mul_qk - ) - - q_transpose_type = None - q_transpose_name = None - for info in self.model.graph().value_info: - if info.name == q_tranpose.output[0]: - q_transpose_type = info.type - q_transpose_name = info.name - break - 
- q_transpose_output = helper.make_value_info( - q_transpose_name[:-2] + "_fake_q", q_transpose_type - ) - q_transpose_node = helper.make_node( - "Transpose", - inputs=[q_add.output[0]], - outputs=[q_transpose_output.name], - name=q_transpose_output.name, - ) - q_transpose_node.attribute.extend( - [helper.make_attribute("perm", [0, 2, 1, 3])] - ) - - k_transpose_output = helper.make_value_info( - q_transpose_name[:-2] + "_fake_k", q_transpose_type - ) - k_transpose_node = helper.make_node( - "Transpose", - inputs=[k_add.output[0]], - outputs=[k_transpose_output.name], - name=k_transpose_output.name, - ) - k_transpose_node.attribute.extend( - [helper.make_attribute("perm", [0, 2, 1, 3])] - ) - - v_transpose_output = helper.make_value_info( - q_transpose_name[:-2] + "_fake_v", q_transpose_type - ) - v_transpose_node = helper.make_node( - "Transpose", - inputs=[v_reshape.output[0]], - outputs=[v_transpose_output.name], - name=v_transpose_output.name, - ) - v_transpose_node.attribute.extend( - [helper.make_attribute("perm", [0, 2, 1, 3])] - ) - - mask_type = None - for info in self.model.graph().value_info: - if info.name == mask_sub.output[0]: - mask_type = info.type - break - - new_mask_type = onnx.TypeProto() - new_mask_type.tensor_type.elem_type = onnx.TensorProto.INT32 - for dim in mask_type.tensor_type.shape.dim: - new_dim = new_mask_type.tensor_type.shape.dim.add() - new_dim.CopyFrom(dim) - - mask_cast_to_int32_output = helper.make_value_info( - mask_sub.name + "_cast_to_int32", new_mask_type - ) - mask_cast_to_int32_node = helper.make_node( - "Cast", - inputs=[mask_sub.output[0]], - outputs=[mask_cast_to_int32_output.name], - name=mask_cast_to_int32_output.name, - ) - mask_cast_to_int32_node.attribute.extend([helper.make_attribute("to", 6)]) - - new_node = self.create_attention_node( - num_heads, - hidden_size, - q_transpose_node.output[0], - k_transpose_node.output[0], - v_transpose_node.output[0], - mask_cast_to_int32_node.output[0], - attention_last_node.output[0], - matmul_qk_add, - ) - if new_node is None: - return - - self.nodes_to_add.extend( - [ - q_transpose_node, - k_transpose_node, - v_transpose_node, - new_node, - mask_cast_to_int32_node, - ] - ) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.node_name_to_graph_name[q_transpose_node.name] = self.this_graph_name - self.node_name_to_graph_name[k_transpose_node.name] = self.this_graph_name - self.node_name_to_graph_name[v_transpose_node.name] = self.this_graph_name - self.node_name_to_graph_name[ - mask_cast_to_int32_node.name - ] = self.this_graph_name - - self.nodes_to_remove.extend(qkv_nodes[3:]) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes[:-1]) - self.nodes_to_remove.extend(k_nodes[:-1]) - self.nodes_to_remove.extend(v_nodes[:-1]) - self.nodes_to_remove.extend([mask_nodes[0]]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py deleted file mode 100644 index dfa14d0e25951f7ce72c719c452ebb56232e14a7..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionRoPE(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "CustomRoPEPluginDynamic_IxRT", "Add") - - def fuse(self, start_node, input_name_to_nodes, output_name_to_node): - src_paths = {"path1": (["Mul", "Concat", "Split", "Slice"], [0, 1, None, 0])} - src_nodes, src_path = self.match_parent_path_from_dict(start_node, src_paths) - if src_nodes is None: - logger.debug("fuse_rope: failed to match src_node") - return - - src_node = src_nodes[0] - - rotate_paths = {"path1": (["Mul", "Reshape", "Concat"], [1, 0, 0])} - rotate_nodes, rotate_path = self.match_parent_path_from_dict( - start_node, rotate_paths - ) - - if rotate_nodes is None: - logger.debug("fuse_rope: failed to match rotate_path") - return - - concat_node = rotate_nodes[-1] - mul_right_node = rotate_nodes[0] - - odd_paths = {"path1": (["Unsqueeze", "Neg", "Slice", "Reshape"], [0, 0, 0, 0])} - odd_nodes, odd_path = self.match_parent_path_from_dict(concat_node, odd_paths) - - if odd_nodes is None: - logger.debug("fuse_rope: failed to match odd_path") - return - - even_paths = {"path1": (["Unsqueeze", "Slice", "Reshape"], [1, 0, 0])} - even_nodes, even_path = self.match_parent_path_from_dict( - concat_node, even_paths - ) - - if even_nodes is None: - logger.debug("fuse_rope: failed to match even_path") - return - reshape_node = even_nodes[-1] - - if reshape_node.output[0] == src_node.input[0]: - rope_node_name = self.model.create_node_name("RoPE") - rope_node = helper.make_node( - "CustomRoPEPluginDynamic_IxRT", - inputs=[ - reshape_node.output[0], - src_nodes[0].input[1], - mul_right_node.input[1], - ], - outputs=[start_node.output[0]], - name=rope_node_name, - ) - rope_node.domain = "com.iluvatar" - rope_node.attribute.extend([helper.make_attribute("type_id", 2)]) - rope_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - rope_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - - self.nodes_to_add.append(rope_node) - self.node_name_to_graph_name[rope_node.name] = self.this_graph_name - - self.nodes_to_remove.extend([start_node]) - self.nodes_to_remove.extend([src_nodes[0]]) - self.nodes_to_remove.extend(rotate_nodes) - self.nodes_to_remove.extend(odd_nodes[:-1]) - self.nodes_to_remove.extend(even_nodes[:-1]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py deleted file mode 100644 index 727d4b82d44805f6d52c8e7fd72d94acf846e73e..0000000000000000000000000000000000000000 --- 
a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Dict, List, Union - -from onnx import NodeProto, TensorProto - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionShape(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "Shape", "Concat") - self.utils = FusionUtils(model) - self.shape_infer = None - self.shape_infer_done = False - - def get_dimensions_from_tensor_proto( - self, tensor_proto: TensorProto - ) -> Union[int, None]: - if tensor_proto.type.tensor_type.HasField("shape"): - return len(tensor_proto.type.tensor_type.shape.dim) - else: - return None - - def get_dimensions(self, input_name: str) -> Union[int, None]: - graph_input = self.model.find_graph_input(input_name) - if graph_input: - return self.get_dimensions_from_tensor_proto(graph_input) - - if not self.shape_infer_done: - self.shape_infer = self.model.infer_runtime_shape({}, update=True) - self.shape_infer_done = True - - if self.shape_infer is not None: - return self.get_dimensions_from_tensor_proto( - self.shape_infer.known_vi_[input_name] - ) - - return None - - def fuse( - self, - concat_node: NodeProto, - input_name_to_nodes: Dict[str, List[NodeProto]], - output_name_to_node: Dict[str, NodeProto], - ): - """ - Smplify subgraph like - - (2d_input) - / \ - Shape shape - / \ - Gather(indices=0) Gather(indices=1) - | | - Unsqueeze(axes=0) Unsqueeze(axes=0) - \ / - Concat - | - - into (2d_input) --> Shape --> - """ - opset_version = self.model.get_opset_version() - - inputs = len(concat_node.input) - root = None - shape_output = None - for i in range(inputs): - path = self.model.match_parent_path( - concat_node, - ["Unsqueeze", "Gather", "Shape"], - [i, 0, 0], - output_name_to_node, - ) - if path is None: - return - - unsqueeze, gather, shape = path - if i == 0: - shape_output = shape.output[0] - if root is None: - root = shape.input[0] - if self.get_dimensions(root) != inputs: - return - elif shape.input[0] != root: - return - - if not FusionUtils.check_node_attribute( - unsqueeze, "axis", 0, default_value=0 - ): - return - - if opset_version < 13: - if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]): - return - else: - if not self.utils.check_node_input_value(unsqueeze, 1, [0]): - return - - value = self.model.get_constant_value(gather.input[1]) - from numpy import array_equal, ndarray - - if not ( - isinstance(value, ndarray) and value.size == 1 and 
value.item() == i - ): - return - - if self.model.find_graph_output(concat_node.output[0]) is None: - self.model.replace_input_of_all_nodes(concat_node.output[0], shape_output) - self.fused_count += 1 - self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py deleted file mode 100644 index d0797b26dc6edfabd91f4bd9d07d0c1da383ef8b..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger - -from onnx import helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSkipLayerNormalization(Fusion): - """ - Fuse Add + LayerNormalization into one node: SkipLayerNormalization - Note: This fusion does not check the input shape of Add and LayerNormalization. - """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization" - ) - # Update shape inference is needed since other fusions might add new edge which does not have shape info yet. - self.shape_infer_helper = self.model.infer_runtime_shape( - {"batch_size": 4, "seq_len": 7}, update=True - ) - - if self.shape_infer_helper is None: - # TODO(tianleiwu): support subgraph in shape inference or add broadcasting in SkipLayerNormalization op. 
- logger.warning("symbolic shape inference disabled or failed.") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - add = self.model.get_parent(node, 0, output_name_to_node) - - # In some models there is input_ids->gather->add->LayerNorm and one of input of the - # add node is initializer with fixed shape which should not be fused into SkipLayerNorm - if add is None: - return - - for add_input in add.input: - if self.model.get_initializer(add_input) != None: - return - - # The number of input node of add should be 2 - if len(self.model.get_parents(add)) != 2: - return - - if self.shape_infer_helper is not None: - if not self.shape_infer_helper.compare_shape(add.input[0], add.input[1]): - logger.debug( - "skip SkipLayerNormalization fusion since shape of inputs (%s, %s) are not same", - add.input[0], - add.input[1], - ) - return - else: - layernorm_weight = self.model.get_initializer(node.input[1]) - if layernorm_weight is not None: - layernorm_weight_arr = NumpyHelper.to_array(layernorm_weight) - hidden_size = layernorm_weight_arr.shape[0] - else: - logger.debug( - "skip SkipLayerNormalization fusion since symbolic shape inference failed" - ) - return - - # gather_path = self.model.match_parent_path(add, ["Gather"], [None]) - # if gather_path is not None and self.model.find_graph_input(gather_path[0].input[1]) is None: - # if self.model.match_parent_path(gather_path[0], ["ConstantOfShape"], [1]) is None: - # return - - if ( - add is not None - and add.op_type == "Add" - and self.model.is_safe_to_fuse_nodes( - [add, node], node.output, input_name_to_nodes, output_name_to_node - ) - ): - self.nodes_to_remove.extend([add, node]) - - inputs = [add.input[0], add.input[1]] - normalize_node = helper.make_node( - "CustomSkipLayerNormPluginDynamic_IxRT", - inputs=inputs, - outputs=[node.output[0]], - name=self.model.create_node_name( - "SkipLayerNormalization", name_prefix="SkipLayerNorm" - ), - ) - normalize_node.domain = "com.iluvatar" - if self.shape_infer_helper is not None: - hidden_size = self.shape_infer_helper.get_edge_shape(node.input[1])[-1] - normalize_node.attribute.extend([helper.make_attribute("ld", hidden_size)]) - normalize_node.attribute.extend([helper.make_attribute("type_id", 2)]) - normalize_node.attribute.extend( - [ - helper.make_attribute( - "beta", self.model.get_initializer(node.input[2]) - ) - ] - ) - normalize_node.attribute.extend( - [ - helper.make_attribute( - "gamma", self.model.get_initializer(node.input[1]) - ) - ] - ) - normalize_node.attribute.extend( - [helper.make_attribute("plugin_namespace", "")] - ) - normalize_node.attribute.extend( - [helper.make_attribute("plugin_version", "1")] - ) - - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - - -class FusionBiasSkipLayerNormalization(Fusion): - def __init__(self, model: OnnxModel): - super().__init__( - model, - "CustomSkipLayerNormPluginDynamic_IxRT", - "SkipLayerNormalization", - "add bias", - ) - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - if len(node.input) != 4: - return - - return_indice = [] - nodes = self.model.match_parent_path( - node, ["Add", "MatMul"], [None, None], None, return_indice - ) - if nodes is None: - return - assert len(return_indice) == 2 - add_input_index = return_indice[0] - if add_input_index >= 2: - return - - (add, matmul) = nodes - - # bias should be one dimension - bias_index = -1 - for i, input in enumerate(add.input): - initializer = self.model.get_initializer(input) - if 
initializer is None: - continue - bias_index = i - bias_weight = NumpyHelper.to_array(initializer) - break - if bias_weight is None: - logger.debug(f"Bias weight not found") - return - if len(bias_weight.shape) != 1: - logger.debug(f"Bias weight is not 1D") - return - - subgraph_nodes = [node, add] - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node - ): - logger.debug( - f"Skip fusing SkipLayerNormalization with Bias since it is not safe" - ) - return - - self.nodes_to_remove.extend(subgraph_nodes) - inputs = [ - node.input[1 - add_input_index], - matmul.output[0], - node.input[2], - node.input[3], - add.input[bias_index], - ] - new_node = helper.make_node( - "CustomSkipLayerNormPluginDynamic_IxRT", - inputs=inputs, - outputs=node.output, - name=self.model.create_node_name( - "SkipLayerNormalization", "SkipLayerNorm_AddBias_" - ), - ) - new_node.domain = "com.iluvatar" - hidden_size = self.shape_infer_helper.get_edge_shape(node.input[2])[-1] - new_node.attribute.extend([helper.make_attribute("ld", hidden_size)]) - new_node.attribute.extend([helper.make_attribute("type_id", 2)]) - new_node.attribute.extend( - [helper.make_attribute("beta", self.model.get_initializer(node.input[3]))] - ) - new_node.attribute.extend( - [helper.make_attribute("gamma", self.model.get_initializer(node.input[2]))] - ) - new_node.attribute.extend( - [ - helper.make_attribute( - "bias", self.model.get_initializer(add.input[bias_index]) - ) - ] - ) - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py deleted file mode 100644 index 436257c3ce09b25790b132b6f918afebc63d9380..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSplitQKV(Fusion): - """ - Fuse FusionSplitQKV - """ - - def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): - super().__init__(model, "SplitQKV_IxRT", "MatMul") - - self.hidden_size = hidden_size - self.num_heads = num_heads - - def create_splitqkv_node( - self, input: str, query_out: str, key_out: str, value_out: str - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - node_name = self.model.create_node_name("SplitQKV_IxRT") - - new_node = helper.make_node( - "SplitQKV_IxRT", - inputs=[input], - outputs=[query_out, key_out, value_out], - name=node_name, - ) - new_node.domain = "com.iluvatar" - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - new_node.attribute.extend( - [helper.make_attribute("atten_scale", 1 / self.num_heads)] - ) - - return new_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - split_query_paths = { - "query_path": ( - ["Div", "Transpose", "Reshape", "Slice", "CustomFCPluginDynamic_IxRT"], - [0, 0, 0, 0, 0], - ), - } - - split_key_paths = { - "key_path": (["Transpose", "Reshape", "Slice"], [1, 0, 0]), - } - - q_nodes, q_path = self.match_parent_path_from_dict(node, split_query_paths) - - k_nodes, k_path = self.match_parent_path_from_dict(node, split_key_paths) - - if (q_nodes is not None) and (k_nodes is not None): - ( - q_div_node, - q_transpose_node, - q_reshape_node, - q_slice_node, - coustom_fc_node, - ) = q_nodes - k_transpose_node, k_reshape_node, k_slice_node = k_nodes - slice_nodes = self.model.get_children(coustom_fc_node) - - if len(slice_nodes) != 3: - return - slice_nodes.remove(q_slice_node) - slice_nodes.remove(k_slice_node) - v_slice_node = slice_nodes[0] - - node.input[0] = q_div_node.input[0] # dele div - new_node = self.create_splitqkv_node( - coustom_fc_node.output[0], - q_slice_node.output[0], - k_slice_node.output[0], - v_slice_node.output[0], - ) - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.nodes_to_remove.append(q_slice_node) - self.nodes_to_remove.append(k_slice_node) - self.nodes_to_remove.append(v_slice_node) - self.nodes_to_remove.append(q_div_node) - - else: - return diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV_update_KVcache.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV_update_KVcache.py deleted file mode 100644 index 4152eef6e6371dd4da27b5315bf5bd741d0749d1..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV_update_KVcache.py +++ /dev/null @@ -1,128 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionSplitQKVUpdateKVCache(Fusion): - """ - Fuse FusionSplitQKVUpdateKVCache - """ - - def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): - super().__init__( - model, "SplitQKVUpdateKVCache_IxRT", "CustomQkvCrossToContext_IxRT" - ) - - self.hidden_size = hidden_size - self.num_heads = num_heads - - def create_node( - self, - inputs: list, - outputs: list, - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): max input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - node_name = self.model.create_node_name("SplitQKVUpdateKVCache_IxRT") - - new_node = helper.make_node( - "SplitQKVUpdateKVCache_IxRT", - inputs=inputs, - outputs=outputs, - name=node_name, - ) - new_node.domain = "com.iluvatar" - new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - new_node.attribute.extend([helper.make_attribute("num_head", self.num_heads)]) - new_node.attribute.extend( - [helper.make_attribute("head_dim", self.hidden_size // self.num_heads)] - ) - - return new_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - query_paths = { - "query_path": ( - ["Transpose", "Reshape", "Split"], - [0, 0, None], - ), - } - - key_paths = { - "key_path": ( - ["Concat", "Transpose", "Reshape", "Split"], - [1, None, 0, None], - ), - } - - value_paths = { - "value_path": ( - ["Concat", "Transpose", "Reshape", "Split"], - [2, None, 0, None], - ), - } - - q_nodes, q_path = self.match_parent_path_from_dict(node, query_paths) - - k_nodes, k_path = self.match_parent_path_from_dict(node, key_paths) - - v_nodes, v_path = self.match_parent_path_from_dict(node, value_paths) - - if (q_nodes is not None) and (k_nodes is not None) and (v_nodes is not None): - (q_transpose_node, q_reshape_node, q_split_node) = q_nodes - (k_concat_node, k_transpose_node, k_reshape_node, k_split_node) = k_nodes - - (v_concat_node, v_transpose_node, v_reshape_node, v_split_node) = v_nodes - - inputs = [ - q_split_node.input[0], - k_concat_node.input[0], - v_concat_node.input[0], - ] - - outputs = [ - q_transpose_node.output[0], - k_concat_node.output[0], - v_concat_node.output[0], - ] - - new_node = self.create_node(inputs, outputs) - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.nodes_to_remove.append(q_transpose_node) - self.nodes_to_remove.append(q_reshape_node) - self.nodes_to_remove.append(q_split_node) - - self.nodes_to_remove.append(k_concat_node) - self.nodes_to_remove.append(k_transpose_node) - self.nodes_to_remove.append(k_reshape_node) - - self.nodes_to_remove.append(v_concat_node) - self.nodes_to_remove.append(v_transpose_node) - self.nodes_to_remove.append(v_reshape_node) - - else: - return \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py 
deleted file mode 100644 index e446a69a636ed38e6e869a15ba6196d727b6d855..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py +++ /dev/null @@ -1,413 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import List, Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -def get_tensor_attr(attrs, attr_name): - result = None - for i in attrs: - if i.name == attr_name: - return numpy_helper.to_array(i.t) - return result - - -class FusionSwinLAttention(Fusion): - """ - Fuse SwinL subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomFCPluginDynamic_IxRT"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_v: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. - - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - v_shape = self.model.get_initializer(reshape_v.input[1]) - if v_shape is None: - logger.debug(f"{reshape_v.input[1]} is not initializer.") - return self.num_heads, self.hidden_size # Fall back to user specified value - - v_shape_value = NumpyHelper.to_array(v_shape) - if len(v_shape_value) != 3 or (v_shape_value[1] <= 0 or v_shape_value[2] <= 0): - logger.debug( - f"v_shape_value={v_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return self.num_heads, self.hidden_size # Fall back to user specified value - - num_heads = 1 - for value_info in self.model.graph().value_info: - if value_info.name == reshape_v.input[0]: - num_heads = value_info.type.tensor_type.shape.dim[2].dim_value - break - hidden_size = v_shape_value[2] - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - inputs: List[str], - output: str, - ) -> Union[NodeProto, None]: - """Create an Attention node. 
- - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - self.fuse_pattern1(normalize_node, input_name_to_nodes, output_name_to_node) - self.fuse_pattern2(normalize_node, input_name_to_nodes, output_name_to_node) - - def fuse_pattern2(self, normalize_node, input_name_to_nodes, output_name_to_node): - """match Swin-L pattern and fuse them to CustomFC --> Attention --> CustomFC""" - logger.debug("fuse swin-L attention pass") - # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern - start_node = normalize_node - qkv_paths = { - "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]), - } - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - assert qkv_path == "path1", "abnormal qkv path" - reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes - - # 2. MatMul as start, go up to find v path - v_paths = { - "path1": ( - ["Transpose", "Reshape", "CustomFCPluginDynamic_IxRT"], - [None, 0, 0], - ) - } - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if not v_nodes: - logger.debug("fuse_attention: failed to match v path") - return - assert v_path == "path1", "abnormal v path" - - # 3. 
MatMul as start, go up to find q,k paths - # q path - q_paths = { - "path1": ( - [ - "Softmax", - "Add", - "Div", - "MatMul", - "Transpose", - "Reshape", - "CustomFCPluginDynamic_IxRT", - ], - [None, 0, 0, 0, 0, 0, 0], - ), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths) - if not q_nodes: - logger.debug("fuse_attention: failed to match q path") - return - assert q_path == "path1", "abnormal q paths found" - - # get Add(bias) input name as fused Attention inputs - add_op, div_op = q_nodes[1], q_nodes[2] - relative_position_bias_name = ( - add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0] - ) - - # k path - k_paths = { - "path2": ( - [ - "Softmax", - "Add", - "Div", - "MatMul", - "Transpose", - "Reshape", - "CustomFCPluginDynamic_IxRT", - ], - [None, 0, 0, 0, 1, 0, 0], - ) - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths) - if not k_nodes: - logger.debug("fuse_attention: failed to match k path") - return - assert k_path == "path2", "abnormal k paths found" - # 4. Fuse 3 CustomFC into one, and fuse attention - # Fuse FCs - fc_nodes = [q_nodes[-1], k_nodes[-1], v_nodes[-1]] - weight = self.fuse_tensor_in_node_attrs( - fc_nodes, "W", q_nodes[-1].name + "_Weight" - ) - bias = self.fuse_tensor_in_node_attrs(fc_nodes, "B", q_nodes[-1].name + "_Bias") - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[q_nodes[-1].input[0]], - outputs=q_nodes[-1].output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend( - [helper.make_attribute("out_dims", numpy_helper.to_array(bias).shape[0])] - ) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", weight)]) - fused_node.attribute.extend([helper.make_attribute("B", bias)]) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - - # Fuse Attention - num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv) - attention_node = self.create_attention_node( - num_heads, - hidden_size, - [fused_node.output[0], relative_position_bias_name], - reshape_qkv.output[0], - ) - if not attention_node: - return - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - self.nodes_to_remove.extend( - [*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes] - ) - self.prune_graph = True - - def fuse_pattern1(self, normalize_node, input_name_to_nodes, output_name_to_node): - """match Swin-L pattern and fuse them to CustomFC --> Attention --> CustomFC""" - logger.debug("fuse swin-L attention pass") - # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern - start_node = normalize_node - qkv_paths = { - "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]), - } - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - assert qkv_path == "path1", "abnormal qkv path" - reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes - - # 2. 
MatMul as start, go up to find v path - v_paths = { - "path1": ( - ["Transpose", "Reshape", "Add", "Split", "MatMul"], - [None, 0, 0, None, 0], - ) - } - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if not v_nodes: - logger.debug("fuse_attention: failed to match v path") - return - assert v_path == "path1", "abnormal v path" - - # 3. MatMul as start, go up to find q,k paths - # q path - q_paths = { - "path1": ( - [ - "Softmax", - "Add", - "Div", - "MatMul", - "Transpose", - "Reshape", - "Add", - "Split", - "MatMul", - ], - [None, 0, 0, 0, 0, 0, 0, None, 0], - ), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths) - if not q_nodes: - logger.debug("fuse_attention: failed to match q path") - return - assert q_path == "path1", "abnormal q paths found" - - # get Add(bias) input name as fused Attention inputs - add_op, div_op = q_nodes[1], q_nodes[2] - relative_position_bias_name = ( - add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0] - ) - - # k path - k_paths = { - "path2": ( - [ - "Softmax", - "Add", - "Div", - "MatMul", - "Transpose", - "Reshape", - "Add", - "Split", - "MatMul", - ], - [None, 0, 0, 0, 1, 0, 0, None, 0], - ) - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths) - if not k_nodes: - logger.debug("fuse_attention: failed to match k path") - return - assert k_path == "path2", "abnormal k paths found" - # 4. Attention and CustomFC have been found, now transform the found nodes to two plugin nodes - # Test 3 paths have the same origin - is_same_origin = q_nodes[-1] is k_nodes[-1] is v_nodes[-1] - is_same_origin &= q_nodes[-2] is k_nodes[-2] is v_nodes[-2] - is_same_origin &= q_nodes[-3] is not k_nodes[-2] is not v_nodes[-3] - if not is_same_origin: - print("swin-L fuse_attention: found qkv path but not has the same origin") - return - origin_matmul = q_nodes[-1] - fc_add = [q_nodes[-3], k_nodes[-3], v_nodes[-3]] - # Now fuse - num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv) - - # Fuse FC - weight = self.model.get_initializer(origin_matmul.input[1]) - biases = [self.model.get_initializer(i.input[0]) for i in fc_add] - if not weight or not all(biases): - print("swin-L: couldn't find weights") - return - weight_arr = onnx.numpy_helper.to_array(weight).transpose(1, 0) - weight.CopyFrom(numpy_helper.from_array(weight_arr)) - bias_arr = np.concatenate( - [onnx.numpy_helper.to_array(i) for i in biases], axis=0 - ) - - fused_node = helper.make_node( - "CustomFCPluginDynamic_IxRT", - inputs=[origin_matmul.input[0]], - outputs=fc_add[0].output, - name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), - ) - fused_node.domain = "com.iluvatar" - fused_node.attribute.extend( - [helper.make_attribute("out_dims", bias_arr.shape[0])] - ) - fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) - fused_node.attribute.extend([helper.make_attribute("W", weight)]) - fused_node.attribute.extend( - [helper.make_attribute("B", numpy_helper.from_array(bias_arr))] - ) - fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - self.nodes_to_add.append(fused_node) - # Fuse Attention - attention_node = self.create_attention_node( - num_heads, - hidden_size, - [fused_node.output[0], relative_position_bias_name], - 
reshape_qkv.output[0], - ) - if not attention_node: - return - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name - self.nodes_to_remove.extend( - [*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes] - ) - self.prune_graph = True - - def fuse_tensor_in_node_attrs(self, fc_nodes, attr_name, tensor_name): - result = [get_tensor_attr(i.attribute, attr_name) for i in fc_nodes] - result = np.concatenate(result, axis=0) - result = numpy_helper.from_array(result, tensor_name) - return result diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py deleted file mode 100644 index bce0ab1713f20a19533e5793c4888607a7619c81..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py +++ /dev/null @@ -1,495 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionT5EncoderAttention(Fusion): - """ - Fuse T5Attention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "RMSNormPluginDynamic_IxRT"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. 
- - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return [0, 0] - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return [0, 0] - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input: str, - output: str, - matmul_qk_add: NodeProto, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - qk_bias = None - has_mask = 0 - has_qk_bias = 0 - add_input_is_value = False - if matmul_qk_add is not None: - has_qk_bias = 1 - qk_bias = self.model.get_initializer(matmul_qk_add.input[1]) - if qk_bias: - add_input_is_value = True - qk_bias_arr = NumpyHelper.to_array(qk_bias) - if len(qk_bias_arr.shape) == 3: - qk_bias_arr = qk_bias_arr.squeeze(0) - has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0) - if np.any(has_neg_inf): - qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype( - np.float32 - ) - qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name)) - - attention_inputs = [input] - - # 如果add的输入不是值,而是一个边,那么这个边的值需要cast到fp32 - cast_node = None - if not add_input_is_value: - cast_out_name = attention_node_name + "_fp32_in1" - cast_out_tensor = helper.make_tensor_value_info( - cast_out_name, TensorProto.FLOAT, [None, None, None, None] - ) - # self.model.add_initializer(cast_out_name) - cast_node = helper.make_node( - "Cast", - inputs=[matmul_qk_add.input[1]], - outputs=[cast_out_tensor.name], - name=self.model.create_node_name("Cast"), - to=1, - ) - self.node_name_to_graph_name[cast_node.name] = self.this_graph_name - attention_inputs.append(cast_out_name) - - if has_qk_bias: - if add_input_is_value: - has_mask = 1 - attention_inputs.append(qk_bias.name) - else: - has_mask = 1 - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - 
attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend( - [helper.make_attribute("has_qk_bias", has_qk_bias)] - ) - attention_node.attribute.extend([helper.make_attribute("is_t5_mode", 1)]) - - return attention_node, cast_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "RMSNormPluginDynamic_IxRT": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. - qkv_paths = { - "path1": (["MatMul", "Reshape", "Transpose", "MatMul"], [0, 0, 0, 0]), - "path2": (["MatMul", "Reshape", "Transpose", "MatMul"], [1, 0, 0, 0]), - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - - if qkv_path in ["path1", "path2"]: - (atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match T5 - Add/Gather --> LayerNormalization --> Attention --> Add --> LayerNormalization - | | - | | - +--------------------------------------------------- - """ - transpose_before_layernorm = self.model.match_parent(start_node, "Gather", 0) - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT": - root_input = child.output[0] - - add_before_layernorm = self.model.match_parent(start_node, "Add", None) - if add_before_layernorm is not None: - node_children = input_name_to_nodes[add_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT": - root_input = child.output[0] - - v_paths = { - "path1": ( - ["Transpose", "Reshape", "Split", "MatMul"], - [1, 0, 0, None], - ) # T5 - } - - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if v_path == "path1": - (_, _, _, matmul_in_qkv) = v_nodes - - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - - qk_paths = { - "path1": (["Softmax", "MatMul"], [0, 0]), - "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]), - } - - qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - - matmul_qk_add = None - if qk_path == "path1": - (_, matmul_qk) = qk_nodes - else: - (_, matmul_qk_add, matmul_qk) = qk_nodes - - q_paths = {"path1": (["Transpose", "Reshape", "Split"], [0, 0, 0])} - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - - if q_path == "path1": - (_, reshape_q, split_q) = q_nodes - # print(" 
split_q.name : ", split_q.name) - - k_paths = { - "path1": (["Transpose", "Reshape", "Split"], [1, 0, 0]), - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - - if k_path == "path1": - (_, _, split_k) = k_nodes - - if ( - matmul_in_qkv.input[0] == root_input - and split_q.input[0] == matmul_in_qkv.output[0] - and split_k.input[0] == matmul_in_qkv.output[0] - ): - attention_last_node = reshape_qkv - - num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q) - - new_node, new_cast_node = self.create_attention_node( - num_heads, - hidden_size, - matmul_in_qkv.output[0], - attention_last_node.output[0], - matmul_qk_add, - ) - if new_node is None: - return - - self.nodes_to_add.append(new_node) - if new_cast_node: - self.nodes_to_add.append(new_cast_node) - - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - self.nodes_to_remove.extend( - [attention_last_node, transpose_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes[:-2]) - - -class FusionT5DecoderAttention(Fusion): - """ - Fuse T5Attention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQkvCrossToContext_IxRT", - ["Softmax"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. - - Args: - reshape_q (NodeProto): reshape node for Q - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: - logger.debug(f"{reshape_q.input[1]} is not initializer.") - return [0, 0] - - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug( - f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." - ) - return [0, 0] - - num_heads = q_shape_value[2] - head_size = q_shape_value[3] - hidden_size = num_heads * head_size - - return num_heads, hidden_size - - def create_decoder_attention_node( - self, inputs: str, outputs: str, type_mask: int, has_mask: int - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. 
- """ - - attention_node_name = self.model.create_node_name("decoder_Attention") - attention_node = helper.make_node( - "CustomQkvCrossToContext_IxRT", - inputs=inputs, - outputs=outputs, - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("scale", 1.0)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("type_mask", type_mask)]) - - return attention_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - """ - path1: - - (query) ---------------->MatMul --> add -->softmax --->MatMul---> - / / / - (key) ---->Transpose --> / / - / / - (mask) ------------------------> / - / - (value)---------------------------------------------> - - - - path2: - - (query) ---------------->MatMul ---------->softmax --->MatMul---> - / / - (key) ---->Transpose --> / - / - / - / - (value)---------------------------------------------> - - """ - - start_node = node - qkv_paths = { - "path1": ( - ["Add", "MatMul", "Transpose"], - [0, 0, 0], - ), # float mask self attention,self attention key pass - "path2": (["MatMul", "Transpose"], [0, 0]), # cross attention qery pass - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - next_nodes = self.model.get_children(node) - if len(next_nodes) == 0: - return - - if next_nodes[0].op_type != "MatMul": - return - - second_matmul_node = next_nodes[0] - attention_inputs = None - attention_outputs = second_matmul_node.output - remove_nodes = [second_matmul_node, node] - if qkv_path == "path1": - (add_node, first_matmul_node, transpose_node) = qkv_nodes - transpose_nodes = self.model.get_parents(first_matmul_node) - q_input = transpose_nodes[0].output[0] - k_input = transpose_nodes[1].input[0] - v_input = second_matmul_node.input[1] - attention_inputs = [q_input, k_input, v_input] - remove_nodes.extend([add_node, first_matmul_node, transpose_nodes[1]]) - - if qkv_path == "path2": - (first_matmul_node, transpose_node) = qkv_nodes - transpose_nodes = self.model.get_parents(first_matmul_node) - q_input = transpose_nodes[0].output[0] - k_input = transpose_nodes[1].input[0] - v_input = second_matmul_node.input[1] - attention_inputs = [q_input, k_input, v_input] - remove_nodes.extend([first_matmul_node, transpose_nodes[1]]) - - has_mask = 0 - type_mask = 4 # int32 mask - - if qkv_path == "path1": - mask_input = add_node.input[0] - score_out = first_matmul_node.output[0] - if add_node.input[0] == score_out: - mask_input = add_node.input[1] - attention_inputs.append(mask_input) - has_mask = 1 - type_mask = 3 # float mask - - atten_node = self.create_decoder_attention_node( - attention_inputs, attention_outputs, type_mask, has_mask - ) - self.nodes_to_add.append(atten_node) - self.node_name_to_graph_name[atten_node.name] = self.this_graph_name - self.nodes_to_remove.extend(remove_nodes) \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py deleted file mode 100644 
index 4765c8f51dbbf7b1f0da9e7821cc714665d1fbd8..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from logging import getLogger -from typing import Tuple - -import numpy -from numpy import array_equal, ndarray -from onnx import NodeProto, TensorProto, helper, numpy_helper -from onnx import onnx_pb as onnx_proto - -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionUtils: - def __init__(self, model: OnnxModel): - self.model: OnnxModel = model - - def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]: - graph_input = self.model.find_graph_input(input_name) - if ( - graph_input is not None - and graph_input.type.tensor_type.elem_type != TensorProto.INT32 - ): - cast_output, cast_node = self.cast_input_to_int32(input_name) - logger.debug(f"Casted graph input {input_name} to int32") - return True, cast_output - - logger.debug( - f"Did not cast graph input {input_name} to int32: found {graph_input is not None}" - ) - return False, input_name - - def cast_input_to_int32(self, input_name: str): - cast_output = input_name + "_int32" - - # Avoid consequent Cast nodes. - inputs = [input_name] - output_name_to_node = self.model.output_name_to_node() - if input_name in output_name_to_node: - parent_node = output_name_to_node[input_name] - if parent_node and parent_node.op_type == "Cast": - inputs = [parent_node.input[0]] - - cast_node = helper.make_node("Cast", inputs=inputs, outputs=[cast_output]) - cast_node.attribute.extend( - [helper.make_attribute("to", int(TensorProto.INT32))] - ) - self.model.add_node(cast_node) - - return cast_output, cast_node - - def remove_cast_int32(self, input_name: str): - input_name_to_nodes = self.model.input_name_to_nodes() - nodes = input_name_to_nodes[input_name] - for node in nodes: - if node.op_type == "Cast": - is_int32 = False - for att in node.attribute: - if att.name == "to" and att.i == int(TensorProto.INT32): - is_int32 = True - break - if is_int32: - output_name = node.output[0] - self.model.remove_node(node) - self.model.replace_input_of_all_nodes(output_name, input_name) - - @staticmethod - def check_node_attribute( - node, attribute_name: str, expected_value, default_value=None - ): - """Verify that a node has expected value for an attribute. - - Args: - node (NodeProto): a node to check - attribute_name (str): name of attribute - expected_value (Any): expected value of the attribute - default_value (Any, optional): default value if the attribute does not exist. Defaults to None. 
- - Returns: - bool: whether the check is passed or not - """ - value = default_value - for attr in node.attribute: - if attr.name == attribute_name: - value = helper.get_attribute_value(attr) - - if isinstance(expected_value, list): - return ( - isinstance(value, ndarray) or isinstance(value, list) - ) and array_equal(expected_value, value, equal_nan=False) - else: - return value == expected_value - - @staticmethod - def transpose_2d_int8_tensor(tensor: onnx_proto.TensorProto): - """Transpose a 2-D INT8 TensorProto - Args: - tensor (TensorProto): tensor to be transposed - Returns: - tensor (TensorProto): transposed tensor - """ - if not isinstance(tensor, onnx_proto.TensorProto): - raise ValueError( - "Expected input type is an ONNX TensorProto but got %s" % type(tensor) - ) - - if len(tensor.dims) != 2 or tensor.data_type != onnx_proto.TensorProto.INT8: - raise ValueError("Only INT8 2-D tensors can be transposed") - - if tensor.raw_data: - int32_data = numpy.reshape( - numpy.frombuffer(tensor.raw_data, dtype="int8"), tensor.dims - ) - int32_transposed_data = numpy.transpose(int32_data, [1, 0]) - tensor.raw_data = int32_transposed_data.tobytes() - - else: - raise ValueError("only raw buffer supported") - - return tensor - - @staticmethod - def check_qdq_node_for_fusion( - node: NodeProto, model: OnnxModel, allow_per_tensor_quantization_only=True - ): - """Verify if a provided QuantizeLinear (Q) / DequantizeLinear (DQ) node is a good candidate for fusion. - It is a good candidate for fusion if: - (1) The Q/DQ node is for per-tensor quantization if allow_per_tensor_quantization_only is `True` - (2) The Q/DQ node should have constant scale - (3) The Q/DQ node should have a zero point of 0 - Args: - node (NodeProto): a Q/DQ node to check - Returns: - bool: whether the check is passed or not - """ - if not node.op_type in {"QuantizeLinear", "DequantizeLinear"}: - logger.debug(f"Provided node is not a Q/DQ node. 
Op Type: {node.op_type}") - - scale = model.get_constant_value(node.input[1]) - - # Scale is not constant - if scale is None: - return False - - # Not per-tensor quantization - scale_has_single_element = scale.ndim == 0 or ( - scale.ndim == 1 and scale.shape[0] == 1 - ) - if allow_per_tensor_quantization_only and not scale_has_single_element: - return False - - # If the Q/DQ node has no zero point input, it is assumed to be 0 (per ONNX spec) - if len(node.input) == 2: - return True - - # Zero point should be constant and should have a value of 0 - zero_point = model.get_constant_value(node.input[2]) - - # Zero point and scale should have same number of dims - if scale.ndim != zero_point.ndim: - return False - - # Zero point is not constant or zero point is not zero - if zero_point is None: - return False - - return numpy.all(zero_point == 0) - - def check_node_input_value(self, node, input_index: int, expected_value): - """Verify that a node has expected input value - - Args: - node (NodeProto): a node to check - input_index (int): index of its input to be verified - expected_value (Any): expected value of the input - - Returns: - bool: whether the check is passed or not - """ - assert len(node.input) > input_index - - value = self.model.get_constant_value(node.input[input_index]) - - if isinstance(expected_value, list): - return ( - isinstance(value, ndarray) or isinstance(value, list) - ) and array_equal(expected_value, value, equal_nan=False) - else: - return value == expected_value - - def remove_identity_nodes(self): - """Remove Identity nodes, except those right before graph output.""" - nodes_to_remove = [] - for node in self.model.nodes(): - if node.op_type == "Identity": - if node.output[0] not in self.model.get_graphs_output_names(): - self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) - nodes_to_remove.append(node) - - if nodes_to_remove: - self.model.remove_nodes(nodes_to_remove) - logger.info(f"Removed {len(nodes_to_remove)} Identity nodes") - - def remove_cascaded_cast_nodes(self): - self.model.remove_cascaded_cast_nodes() - - def remove_useless_cast_nodes(self): - self.model.remove_useless_cast_nodes() - - def remove_useless_reshape_nodes(self): - """Remove reshape node that is not needed based on symbolic shape inference: input and output has same shape""" - shape_infer = self.model.infer_runtime_shape(update=True) - if shape_infer is None: - return - - nodes_to_remove = [] - for node in self.model.nodes(): - if node.op_type == "Reshape": - input_shape = shape_infer.get_edge_shape(node.input[0]) - output_shape = shape_infer.get_edge_shape(node.output[0]) - if input_shape and output_shape and input_shape == output_shape: - logger.info( - f"Remove reshape node {node.name} since its input shape is same as output: {input_shape}" - ) - nodes_to_remove.append(node) - - if nodes_to_remove: - graph_input_names = set(self.model.get_graphs_input_names()) - graph_output_names = set(self.model.get_graphs_output_names()) - for node in nodes_to_remove: - if bool(set(node.output) & graph_output_names): - if not bool(set(node.input) & graph_input_names): - self.model.replace_output_of_all_nodes( - node.input[0], node.output[0] - ) - else: - continue - else: - self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) - self.model.remove_node(node) - - -class NumpyHelper: - @staticmethod - def to_array(tensor: TensorProto, fill_zeros: bool = False) -> ndarray: - # When weights are in external data format but not presented, we can still test the optimizer with two 
changes: - # (1) set fill_zeros = True (2) change load_external_data=False in optimizer.py - if fill_zeros: - from onnx import mapping - - return ndarray( - shape=tensor.dims, - dtype=mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.data_type], - ) - - return numpy_helper.to_array(tensor) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py deleted file mode 100644 index d3244b7a609da3d8bfda6f91ed606259093e59c4..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -import math -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionVideoBertAttention(Fusion): - """ - Fuse VideoBertAttention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size( - self, atten_matmul: NodeProto, div: NodeProto - ) -> Tuple[int, int]: - """Detect num_heads and hidden_size from a reshape node. 
- - Args: - atten_matmul (NodeProto): MatMul node whose weight initializer gives the hidden size - div (NodeProto): Div node whose constant gives the head size - - Returns: - Tuple[int, int]: num_heads and hidden_size - """ - - # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1]) - div_initializer = self.model.get_initializer(div.input[1]) - - # Check whether float_data is empty - if len(div_initializer.float_data) > 0: - div_value = div_initializer.float_data[0] - else: - # If float_data is empty, try to read the data another way, - # e.g. when it is stored in raw_data - if len(div_initializer.raw_data) > 0: - dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type] - div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0] - else: - raise ValueError("Data not found in the div_initializer") - - atten_matul_shape_value = NumpyHelper.to_array(atten_matul_initializer).shape - head_dim = math.ceil(div_value * div_value) - hidden_size = atten_matul_shape_value[0] - num_heads = hidden_size // head_dim - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input: str, - output: str, - matmul_qk_add: NodeProto, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed. - """ - assert num_heads > 0 - - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - qk_bias = None - has_mask = 0 - has_qk_bias = 0 - if matmul_qk_add is not None: - has_qk_bias = 1 - qk_bias = self.model.get_initializer(matmul_qk_add.input[1]) - qk_bias_arr = NumpyHelper.to_array(qk_bias) - if len(qk_bias_arr.shape) == 3: - qk_bias_arr = qk_bias_arr.squeeze(0) - has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0) - if np.any(has_neg_inf): - qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype( - np.float32 - ) - qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name)) - - attention_inputs = [input] - - if qk_bias is not None: - has_mask = 1 - attention_inputs.append(qk_bias.name) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend( - [helper.make_attribute("has_qk_bias", has_qk_bias)] - ) - - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before
layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. - qkv_paths = { - "path1": ( - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [0, None, 0, 0, 0], - ), - "path2": ( - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [1, None, 0, 0, 0], - ), - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - - if qkv_path in ["path1", "path2"]: - (_, atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match videobert - transpose/Add --> LayerNormalization --> Attention --> Add --> LayerNormalization - | | - | | - +--------------------------------------------------------- - """ - transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0) - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - add_before_layernorm = self.model.match_parent(start_node, "Add", None) - if add_before_layernorm is not None: - node_children = input_name_to_nodes[add_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - v_paths = { - "path1": ( - ["Transpose", "Reshape", "Slice", "Add", "MatMul"], - [1, 0, 0, 0, None], - ) # videobert - } - - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - if v_path == "path1": - (_, _, _, add_in_qkv, matmul_in_qkv) = v_nodes - - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - - qk_paths = { - "path1": (["Softmax", "MatMul"], [0, 0]), - "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]), - } - - qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) - - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - - matmul_qk_add = None - if qk_path == "path1": - (_, matmul_qk) = qk_nodes - else: - (_, matmul_qk_add, matmul_qk) = qk_nodes - - q_paths = { - "path1": (["Transpose", "Reshape", "Slice"], [0, 0, 0]), - "path2": (["Div", "Transpose", "Reshape", "Slice"], [0, 0, 0, 0]), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - - if q_path == "path1": - (_, _, slice_q) = q_nodes - else: - (div, _, _, slice_q) = q_nodes - - k_paths = { - "path1": (["Transpose", "Reshape", "Slice"], [1, 0, 0]), - "path2": (["Div", "Transpose", "Reshape", "Slice"], [1, 0, 0, 0]), - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - - if k_path == "path1": - (_, _, slice_k) = k_nodes - else: - 
(div, _, _, slice_k) = k_nodes - - if ( - matmul_in_qkv.input[0] == root_input - and slice_q.input[0] == add_in_qkv.output[0] - and slice_k.input[0] == add_in_qkv.output[0] - ): - attention_last_node = reshape_qkv - - num_heads, hidden_size = self.get_num_heads_and_hidden_size( - atten_matmul, div - ) - - new_node = self.create_attention_node( - num_heads, - hidden_size, - add_in_qkv.output[0], - attention_last_node.output[0], - matmul_qk_add, - ) - if new_node is None: - return - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - self.nodes_to_remove.extend( - [attention_last_node, transpose_qkv, matmul_qkv] - ) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes) - self.nodes_to_remove.extend(k_nodes) - self.nodes_to_remove.extend(v_nodes[:-2]) - - # fuse head and tail transpose - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[ - transpose_before_layernorm.output[0] - ] - for child in node_children: - for i, input in enumerate(child.input): - if child.input[i] == transpose_before_layernorm.output[0]: - child.input[i] = transpose_before_layernorm.input[0] - self.nodes_to_remove.extend([transpose_before_layernorm]) - - node = transpose_before_layernorm - while True: - found = False - node_children = input_name_to_nodes[node.output[0]] - for child in node_children: - if child is not None and child.op_type in [ - "SkipLayerNorm", - "Add", - ]: - node = child - found = True - break - if not found: - break - node_children = input_name_to_nodes[node.output[0]] - if len(node_children) == 1 and node_children[0].op_type == "Transpose": - transpose_node = node_children[0] - transpose_children = input_name_to_nodes[transpose_node.output[0]] - for i, input in enumerate(transpose_children[0].input): - if transpose_children[0].input[i] == transpose_node.output[0]: - transpose_children[0].input[i] = transpose_node.input[0] - self.nodes_to_remove.extend([transpose_node]) - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - # self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py deleted file mode 100644 index f1a5410b62283e45f4f0a8957eaf7e83be6a6124..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py +++ /dev/null @@ -1,469 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -import math -from typing import Dict -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import Tuple, Union - -import numpy as np -import onnx -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_options import AttentionMaskFormat -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel -from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto - -logger = getLogger(__name__) - - -class FusionVITAttention(Fusion): - """ - Fuse VITAttention subgraph into one Attention node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__( - model, - "CustomQKVToContextPluginDynamic_IxRT", - ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], - ) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def get_num_heads_and_hidden_size( - self, custom_fc: NodeProto, mul: NodeProto - ) -> Tuple[int, int]: - mul_initializer = self.model.get_initializer(mul.input[1]) - - # Check whether float_data is empty - if len(mul_initializer.float_data) > 0: - mul_value = mul_initializer.float_data[0] - else: - # If float_data is empty, try to read the data another way, - # e.g. when it is stored in raw_data - if len(mul_initializer.raw_data) > 0: - dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type] - mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0] - else: - raise ValueError("Data not found in the mul_initializer") - - for attr in custom_fc.attribute: - if attr.name == "W": - tensor_value = attr.t - tensor_shape = [dim for dim in tensor_value.dims] - break - head_dim = math.floor(1.0 / (mul_value * mul_value)) * math.floor( - 1.0 / (mul_value * mul_value) - ) - hidden_size = tensor_shape[0] - num_heads = hidden_size // head_dim - - return num_heads, hidden_size - - def create_attention_node( - self, - num_heads: int, - hidden_size: int, - input: str, - output: str, - matmul_qk_add: NodeProto, - ) -> Union[NodeProto, None]: - """Create an Attention node. - - Args: - num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. - hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed.
- """ - assert num_heads > 0 - # print(hidden_size, num_heads) - if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug( - f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" - ) - return None - - attention_node_name = self.model.create_node_name("Attention") - - qk_bias = None - has_mask = 0 - has_qk_bias = 0 - if matmul_qk_add is not None: - has_qk_bias = 1 - qk_bias = self.model.get_initializer(matmul_qk_add.input[1]) - qk_bias_arr = NumpyHelper.to_array(qk_bias) - if len(qk_bias_arr.shape) == 3: - qk_bias_arr = qk_bias_arr.squeeze(0) - has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0) - if np.any(has_neg_inf): - qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype( - np.float32 - ) - qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name)) - - attention_inputs = [input] - - if qk_bias is not None: - has_mask = 1 - attention_inputs.append(qk_bias.name) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=attention_inputs, - outputs=[output], - name=attention_node_name, - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend( - [helper.make_attribute("hidden_size", hidden_size)] - ) - attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend( - [helper.make_attribute("has_qk_bias", has_qk_bias)] - ) - - return attention_node - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm - # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern - start_node = normalize_node - if normalize_node.op_type == "LayerNormalization": - add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) - if add_before_layernorm is not None: - start_node = add_before_layernorm - - # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
- qkv_paths = { - "path1": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [0, 0, 0]), - "path2": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [1, 0, 0]), - } - - qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) - - if qkv_nodes is None: - logger.debug("fuse_attention: failed to match qkv path") - return - - if qkv_path in ["path1", "path2"]: - (custom_fc_after_atten, transpose_qkv, matmul_qkv) = qkv_nodes - - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue - - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - - root_input = other_inputs[0] - """ - Match VIT - transpose --> LayerNormalization --> custom_fc -> attention -> Add - | | - | | - +------------------------------------------------------------------- - """ - transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0) - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - add_before_layernorm = self.model.match_parent(start_node, "Add", None) - if add_before_layernorm is not None: - node_children = input_name_to_nodes[add_before_layernorm.output[0]] - for child in node_children: - if child is not None and child.op_type == "LayerNormalization": - root_input = child.output[0] - - # print("root_input: ", root_input, matmul_qkv.name) - v_paths = { - "path1": ( - [ - "Reshape", - "Transpose", - "Reshape", - "Gather", - "Squeeze", - "Transpose", - "Unsqueeze", - "Reshape", - "CustomFCPluginDynamic_IxRT", - ], - [1, 0, 0, 0, 0, 0, 0, 0, 0], - ) # vit - } - - v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) - - squeeze_input = custom_fc = None - if v_path == "path1": - (_, _, _, _, squeeze_input, _, _, _, custom_fc) = v_nodes - - if v_nodes is None: - logger.debug("fuse_attention: failed to match v path") - return - - qk_paths = { - "path1": (["Softmax", "MatMul"], [0, 0]), - "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]), - } - - qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) - # print("qk_nodes:", qk_nodes[1].name) - if qk_nodes is None: - logger.debug("fuse_attention: failed to match qk path") - return - - matmul_qk_add = None - if qk_path == "path1": - (_, matmul_qk) = qk_nodes - else: - (_, matmul_qk_add, matmul_qk) = qk_nodes - - q_paths = { - "path1": ( - ["Mul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze"], - [0, 0, 0, 0, 0, 0], - ), - } - q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) - # print("q_nodes:", q_nodes[0].name) - squeeze_q = mul_q = None - if q_path == "path1": - squeeze_q = q_nodes[-1] - mul_q = q_nodes[0] - - if q_nodes is None: - logger.debug("fuse_attention: failed to match q path") - return - - k_paths = { - "path1": ( - [ - "Mul", - "Transpose", - "Reshape", - "Transpose", - "Reshape", - "Gather", - "Squeeze", - ], - [1, 0, 0, 0, 0, 0, 0], - ), - } - k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) - # print("k_nodes:", k_nodes[0].name) - squeeze_k = None - if k_path == "path1": - squeeze_k = k_nodes[-1] - - if k_nodes is None: - logger.debug("fuse_attention: failed to match k path") - return - - if ( - custom_fc.input[0] == root_input - and squeeze_input == squeeze_q - and squeeze_input == squeeze_k - ): - 
attention_last_node = transpose_qkv - - num_heads, hidden_size = self.get_num_heads_and_hidden_size( - custom_fc_after_atten, mul_q - ) - - new_node = self.create_attention_node( - num_heads, - hidden_size, - custom_fc.output[0], - attention_last_node.output[0], - matmul_qk_add, - ) - if new_node is None: - return - - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name - - self.nodes_to_remove.extend([transpose_qkv, matmul_qkv]) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(q_nodes[:-1]) - self.nodes_to_remove.extend(k_nodes[:-1]) - self.nodes_to_remove.extend(v_nodes[:-1]) - - # fuse head and tail transpose - if transpose_before_layernorm is not None: - node_children = input_name_to_nodes[ - transpose_before_layernorm.output[0] - ] - for child in node_children: - for i, input in enumerate(child.input): - if child.input[i] == transpose_before_layernorm.output[0]: - child.input[i] = transpose_before_layernorm.input[0] - self.nodes_to_remove.extend([transpose_before_layernorm]) - - node = transpose_before_layernorm - while True: - found = False - node_children = input_name_to_nodes[node.output[0]] - for child in node_children: - if child is not None and child.op_type in [ - "SkipLayerNorm", - "Add", - ]: - node = child - found = True - break - if not found: - break - node_children = input_name_to_nodes[node.output[0]] - if len(node_children) == 1 and node_children[0].op_type == "Transpose": - transpose_node = node_children[0] - transpose_children = input_name_to_nodes[transpose_node.output[0]] - for i, input in enumerate(transpose_children[0].input): - if transpose_children[0].input[i] == transpose_node.output[0]: - transpose_children[0].input[i] = transpose_node.input[0] - self.nodes_to_remove.extend([transpose_node]) - # Use prune graph to remove mask nodes since they are shared by all attention nodes. - # self.nodes_to_remove.extend(mask_nodes) - # self.prune_graph = True - - -class FusionTorchvisionVITAttention(Fusion): - """ - Fuse VITAttention subgraph into one Attention node. 
- """ - - def __init__(self, model: OnnxModel): - super().__init__( - model, "CustomQKVToContextPluginDynamic_IxRT", "CustomFCPluginDynamic_IxRT" - ) - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - """ - [Root] --> CustomFCPluginDynamic_IxRT--> CustomQKVToContextPluginDynamic_IxRT --> CustomFCPluginDynamic_IxRT - """ - children = self.model.get_children(node, input_name_to_nodes) - parent = self.model.get_parents(node, output_name_to_node) - - if len(children) != 1: - return - if len(parent) != 1: - return - - fc_first_node = None - for par in parent: - fc_first_node = self.model.find_first_parent_by_type( - par, "CustomFCPluginDynamic_IxRT", output_name_to_node, recursive=True - ) - if fc_first_node is not None: - break - if fc_first_node is None: - return - - start_node = node - - # v path - v_nodes = self.model.match_parent_path( - start_node, - ["Transpose", "MatMul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze", "Transpose", "Unsqueeze", "Reshape"], - [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - # path1, q and k path - q_nodes = self.model.match_parent_path( - start_node, - ["Transpose", "MatMul", "Softmax", "MatMul", "Mul", "Transpose", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze", "Transpose", "Unsqueeze", "Reshape"], - [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - k_nodes = self.model.match_parent_path( - start_node, - ["Transpose", "MatMul", "Softmax", "MatMul", "Mul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze", "Transpose", "Unsqueeze", "Reshape"], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - output_name_to_node, - ) - - if v_nodes is None: - return - - if v_nodes and q_nodes and k_nodes: - subgraph_nodes = [] - subgraph_nodes.extend(q_nodes) - subgraph_nodes.extend(k_nodes) - subgraph_nodes.extend(v_nodes) - - subgraph_nodes_unique = [] - for item in subgraph_nodes: - if item not in subgraph_nodes_unique: - subgraph_nodes_unique.append(item) - - hidden_size = start_node.attribute[0].i - _, mul_val = self.model.get_constant_input(k_nodes[4]) - num_heads = hidden_size // (math.floor(1.0 / (mul_val * mul_val)) * math.floor(1.0 / (mul_val * mul_val))) - - attention_node = helper.make_node( - "CustomQKVToContextPluginDynamic_IxRT", - inputs=[fc_first_node.output[0]], - outputs=[start_node.input[0]], - name=self.model.create_node_name( - "TorchvisionVitAttention", name_prefix="TorchvisionVitAttention" - ), - ) - attention_node.domain = "com.iluvatar" - attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) - attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) - attention_node.attribute.extend([helper.make_attribute("hidden_size", hidden_size)]) - attention_node.attribute.extend([helper.make_attribute("has_mask", 0)]) - attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 0)]) - - self.nodes_to_remove.extend(subgraph_nodes_unique) - - self.nodes_to_add.append(attention_node) - self.node_name_to_graph_name[attention_node.name] = self.this_graph_name \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py deleted file mode 100644 index 
df55ba645988ddbffcd157e38db2c73ff34789a2..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -from logging import getLogger -from typing import Tuple, Union - -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -class FusionXSoftmax(Fusion): - """ - Fuse Where + Softmax + Where into one node: XSoftmax - """ - - def __init__(self, model: OnnxModel): - super().__init__(model, "XSoftmax_IxRT", "MatMul") - - def create_xsoftmax_node( - self, data_input: str, mask_input: str, output: str - ) -> Union[NodeProto, None]: - """Create an XSoftmax node. - - Args: - data_input (str): data input name - mask_input (str): mask input name - output (str): output name - - Returns: - Union[NodeProto, None]: the node created or None if failed.
- """ - xsoftmax_node_name = self.model.create_node_name("XSoftmax") - - xsoftmax_node = helper.make_node( - "XSoftmax_IxRT", - inputs=[data_input, mask_input], - outputs=[output], - name=xsoftmax_node_name, - ) - xsoftmax_node.domain = "com.iluvatar" - xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) - xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) - xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)]) - xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)]) - - return xsoftmax_node - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - - xsoftmax_paths = { - "path": (["Where", "Softmax", "Where", "Add"], [None, None, None, None]), - } - xsoftmax_nodes, xsoftmax_path = self.match_parent_path_from_dict( - node, xsoftmax_paths - ) - - if xsoftmax_nodes is None: - logger.debug("fuse_xsoftmax: failed to match xsoftmax path") - return - else: - (tail_where, softmax, head_where, add) = xsoftmax_nodes - where_inputs = [i for i in tail_where.input if i in head_where.input] - assert len(where_inputs) == 1 - mask_input = where_inputs[0] - data_input = add.output[0] - data_output = tail_where.output[0] - - xsoftmax_node = self.create_xsoftmax_node( - data_input, mask_input, data_output - ) - - self.nodes_to_add.append(xsoftmax_node) - self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name - self.nodes_to_remove.append(tail_where) - self.nodes_to_remove.append(softmax) - self.nodes_to_remove.append(head_where) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py deleted file mode 100644 index f2d07ce96d60c5e8fbfc749d1049bad471525239..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -from enum import Enum -from logging import getLogger -from os import name -from sys import path -from typing import List, Tuple, Union - -import numpy as np -from onnx import NodeProto, TensorProto, helper, numpy_helper - -from .fusion_base import Fusion -from .fusion_utils import FusionUtils, NumpyHelper -from .onnx_model import OnnxModel - -logger = getLogger(__name__) - - -def get_tensor_attr(attrs, attr_name): - result = None - for i in attrs: - if i.name == attr_name: - return numpy_helper.to_array(i.t) - return result - - -class FusionYoloV5Decoder(Fusion): - """ - Fuse the YoloV5 decoder subgraph into one YoloV5Decoder node. - """ - - def __init__( - self, - model: OnnxModel, - ): - super().__init__(model, "YoloV5Decoder", ["Reshape"]) - - # Flags to show warning only once - self.num_heads_warning = True - self.hidden_size_warning = True - - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - short_path = ["Concat", "Slice", "Sigmoid", "Transpose", "Reshape"] - paths = [ - (["Concat", "Unsqueeze", "Gather", "Shape"], [1] + [None] * 3), - ( - ["Concat", "Mul", "Add", "Sub", "Mul", "Slice", "Sigmoid", "Transpose"], - [0, 0] + [None] * 6, - ), - ( - ["Concat", "Mul", "Pow", "Mul", "Slice", "Sigmoid", "Transpose"], - [0, 1] + [None] * 5, - ), - (short_path, [None] * 5), - (short_path + ["Concat", "Unsqueeze", "Gather", "Shape"], [None] * 9), - ] - paths_found = [] - nodes_names_found = set() - nodes_found = [] - for path_i in paths: - nodes = self.model.match_parent_path(normalize_node, path_i[0], path_i[1]) - paths_found.append(nodes) - if nodes: - for n in nodes: - if n.name not in nodes_names_found: - nodes_names_found.add(n.name) - nodes_found.append(n) - if not all(paths_found): - return - shape_node = paths_found[-1][-1] - params = self._find_yolov5_decoder_params(paths_found) - self._fuse_node( - inputs=shape_node.input, outputs=normalize_node.output, params=params - ) - self.nodes_to_remove.extend(nodes_found) - self._delete_extra_output_edges(paths_found) - self.prune_graph = True - - def _fuse_node(self, inputs, outputs, params): - fused_node = helper.make_node( - "YoloV5Decoder", - inputs=inputs, - outputs=outputs, - name=self.model.create_node_name("YoloV5Decoder"), - ) - fused_node.attribute.extend(params) - self.nodes_to_add.append(fused_node) - self.node_name_to_graph_name[fused_node.name] = self.this_graph_name - - def _delete_extra_output_edges(self, paths_found): - transpose_node = paths_found[2][-1] - assert transpose_node.op_type == "Transpose" - out_edge = transpose_node.output[0] - for item in self.model.graph().output: - if item.name == out_edge: - self.model.graph().output.remove(item) - logger.warning(f"Output: {out_edge} is useless in graph, delete it") - return - - def _find_yolov5_decoder_params(self, paths_found): - # num_class - concat_op = paths_found[0][0] - assert concat_op.op_type == "Concat" - num_class_arr = self.model.get_initializer(concat_op.input[2], True) - assert num_class_arr - num_class = (num_class_arr - 5).tolist()[0] - num_class = helper.make_attribute("num_class", num_class) - - # stride - mul_op = paths_found[1][1] - assert mul_op.op_type == "Mul" - input_arrs = self.model.get_initializer_input_edges(mul_op.name, True) - assert len(input_arrs) == 1 - stride = input_arrs[0].tolist() - stride = helper.make_attribute("stride", stride) - - # anchor - mul_op = paths_found[2][1] - assert mul_op.op_type == "Mul" - anchor =
self.model.get_initializer_input_edges(mul_op.name, True) - assert len(anchor) == 1 - anchor = anchor[0] - anchor = anchor[0, :, 0, 0, :] if len(anchor.shape) == 5 else anchor[:, 0, 0, :] - anchor = helper.make_attribute("anchor", list(anchor.flatten())) - - # fast_impl - fast_impl = helper.make_attribute("faster_impl", 1) - - return [num_class, stride, anchor, fast_impl] diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py deleted file mode 100644 index 0b76f660fce62ec0aa19b8c132a6ba51cf6fe319..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py +++ /dev/null @@ -1,1182 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -import logging -import os -import sys -from collections import deque -from pathlib import Path -from typing import Dict, List, Optional, Tuple - -from onnx import ( - AttributeProto, - GraphProto, - ModelProto, - NodeProto, - TensorProto, - helper, - numpy_helper, - save_model, -) - -from .float16 import convert_float_to_float16 -from .shape_infer_helper import SymbolicShapeInferenceHelper - -logger = logging.getLogger(__name__) - - -class OnnxModel: - def __init__(self, model): - self.initialize(model) - self.initializer_visited: Dict[str, bool] = {} - - def initialize(self, model): - self.model: ModelProto = model - self._node_name_suffix: Dict[ - str, int - ] = {} # key is node name prefix, value is the last suffix generated - self.shape_infer_helper: SymbolicShapeInferenceHelper = None - self.enable_shape_infer: bool = True - self.all_graphs: Optional[List[GraphProto]] = None - - def disable_shape_inference(self): - self.enable_shape_infer = False - - def infer_runtime_shape(self, dynamic_axis_mapping={}, update=False): - if self.enable_shape_infer: - if self.shape_infer_helper is None or update: - self.shape_infer_helper = SymbolicShapeInferenceHelper(self.model) - - try: - if self.shape_infer_helper.infer(dynamic_axis_mapping): - return self.shape_infer_helper - except: - self.enable_shape_infer = ( - False # disable shape inference to suppress same error message. 
- ) - print("failed in shape inference", sys.exc_info()[0]) - - return None - - def input_name_to_nodes(self): - input_name_to_nodes = {} - for node in self.nodes(): - for input_name in node.input: - if input_name not in input_name_to_nodes: - input_name_to_nodes[input_name] = [node] - else: - input_name_to_nodes[input_name].append(node) - return input_name_to_nodes - - def output_name_to_node(self): - output_name_to_node = {} - for node in self.nodes(): - for output_name in node.output: - output_name_to_node[output_name] = node - return output_name_to_node - - def nodes(self): - all_nodes = [] - for graph in self.graphs(): - for node in graph.node: - all_nodes.append(node) - return all_nodes - - def graph(self): - return self.model.graph - - def graphs(self): - if self.all_graphs is not None: - return self.all_graphs - self.all_graphs = [] - graph_queue = [self.model.graph] - while graph_queue: - graph = graph_queue.pop(0) - self.all_graphs.append(graph) - for node in graph.node: - for attr in node.attribute: - if attr.type == AttributeProto.AttributeType.GRAPH: - assert isinstance(attr.g, GraphProto) - graph_queue.append(attr.g) - if attr.type == AttributeProto.AttributeType.GRAPHS: - for g in attr.graphs: - assert isinstance(g, GraphProto) - graph_queue.append(g) - return self.all_graphs - - def get_graphs_input_names(self): - input_names = [] - for graph in self.graphs(): - for input in graph.input: - input_names.append(input.name) - return input_names - - def get_graphs_output_names(self): - output_names = [] - for graph in self.graphs(): - for output in graph.output: - output_names.append(output.name) - return output_names - - def get_graph_by_node(self, node): - for graph in self.graphs(): - if node in graph.node: - return graph - return None - - def get_graph_by_name(self, graph_name): - for graph in self.graphs(): - if graph_name == graph.name: - return graph - return None - - def get_topological_insert_id(self, graph, outputs): - for idx, node in enumerate(graph.node): - for input in node.input: - if input in outputs: - return idx - return len(graph.node) - - def remove_node(self, node): - for graph in self.graphs(): - if node in graph.node: - graph.node.remove(node) - - def remove_nodes(self, nodes_to_remove): - for node in nodes_to_remove: - self.remove_node(node) - - def add_node(self, node, graph_name=None): - if graph_name is None or graph_name == self.model.graph.name: - self.model.graph.node.extend([node]) - else: - graph = self.get_graph_by_name(graph_name) - insert_idx = self.get_topological_insert_id(graph, node.output) - graph.node.insert(insert_idx, node) - - def add_nodes(self, nodes_to_add, node_name_to_graph_name=None): - if node_name_to_graph_name is None: - self.model.graph.node.extend(nodes_to_add) - else: - for node in nodes_to_add: - graph_name = node_name_to_graph_name[node.name] - self.add_node(node, graph_name) - - def add_initializer(self, tensor, graph_name=None): - if graph_name is None or graph_name == self.model.graph.name: - self.model.graph.initializer.extend([tensor]) - else: - graph = self.get_graph_by_name(graph_name) - graph.initializer.extend([tensor]) - - def add_input(self, input, graph_name=None): - if graph_name is None or graph_name == self.model.graph.name: - self.model.graph.input.extend([input]) - else: - graph = self.get_graph_by_name(graph_name) - graph.input.extend([input]) - - @staticmethod - def replace_node_input(node, old_input_name, new_input_name): - assert isinstance(old_input_name, str) and isinstance(new_input_name, str) - 
for j in range(len(node.input)): - if node.input[j] == old_input_name: - node.input[j] = new_input_name - - def replace_input_of_all_nodes(self, old_input_name, new_input_name): - for node in self.model.graph.node: - OnnxModel.replace_node_input(node, old_input_name, new_input_name) - - @staticmethod - def replace_node_output(node, old_output_name, new_output_name): - assert isinstance(old_output_name, str) and isinstance(new_output_name, str) - for j in range(len(node.output)): - if node.output[j] == old_output_name: - node.output[j] = new_output_name - - def replace_output_of_all_nodes(self, old_output_name, new_output_name): - for node in self.model.graph.node: - OnnxModel.replace_node_output(node, old_output_name, new_output_name) - - def get_initializer(self, name, return_np_array=False): - for graph in self.graphs(): - for tensor in graph.initializer: - if tensor.name == name: - return numpy_helper.to_array(tensor) if return_np_array else tensor - return None - - def get_node(self, op_name): - for graph in self.graphs(): - for n in graph.node: - if n.name == op_name: - return n - return None - - def get_initializer_input_edges(self, op_name, return_np_array=False): - initializers = {i.name: i for graph in self.graphs() for i in graph.initializer} - node = self.get_node(op_name) - assert node - result = [] - for i in node.input: - if i in initializers: - tensor = initializers[i] - tensor = numpy_helper.to_array(tensor) if return_np_array else tensor - result.append(tensor) - return result - - def get_nodes_by_op_type(self, op_type): - nodes = [] - for node in self.nodes(): - if node.op_type == op_type: - nodes.append(node) - return nodes - - def get_children(self, node, input_name_to_nodes=None): - if input_name_to_nodes is None: - input_name_to_nodes = self.input_name_to_nodes() - - children = [] - for output in node.output: - if output in input_name_to_nodes: - for node in input_name_to_nodes[output]: - children.append(node) - return children - - def get_parents(self, node, output_name_to_node=None): - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - parents = [] - for input in node.input: - if input in output_name_to_node: - parents.append(output_name_to_node[input]) - return parents - - def get_parent(self, node, i, output_name_to_node=None): - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - if len(node.input) <= i: - return None - - input = node.input[i] - if input not in output_name_to_node: - return None - - return output_name_to_node[input] - - def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=[]): - """ - Find parent node based on constraints on op_type. - - Args: - node (str): current node name. - parent_op_type (str): constraint of parent node op_type. - output_name_to_node (dict): dictionary with output name as key, and node as value. - exclude (list): list of nodes that are excluded (not allowed to match as parent). - - Returns: - parent: The matched parent node. None if not found. - index: The input index of matched parent node. None if not found. 
- """ - for i, input in enumerate(node.input): - if input in output_name_to_node: - parent = output_name_to_node[input] - if parent.op_type == parent_op_type and parent not in exclude: - return parent, i - else: - logger.debug( - f"To find first {parent_op_type}, current {parent.op_type}" - ) - return None, None - - def match_parent( - self, - node, - parent_op_type, - input_index=None, - output_name_to_node=None, - exclude=[], - return_indice=None, - ): - """ - Find parent node based on constraints on op_type and index. - When input_index is None, we will find the first parent node based on constraints, and return_indice will be appended the corresponding input index. - - Args: - node (str): current node name. - parent_op_type (str): constraint of parent node op_type. - input_index (int or None): only check the parent given input index of current node. - output_name_to_node (dict): dictionary with output name as key, and node as value. - exclude (list): list of nodes that are excluded (not allowed to match as parent). - return_indice (list): a list to append the input index when input_index is None. - - Returns: - parent: The matched parent node. - """ - assert node is not None - assert input_index is None or input_index >= 0 - - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - if input_index is None: - parent, index = self.match_first_parent( - node, parent_op_type, output_name_to_node, exclude - ) - if return_indice is not None: - return_indice.append(index) - return parent - - if input_index >= len(node.input): - logger.debug(f"input_index {input_index} >= node inputs {len(node.input)}") - return None - - parent = self.get_parent(node, input_index, output_name_to_node) - if ( - parent is not None - and parent.op_type == parent_op_type - and parent not in exclude - ): - return parent - - if parent is not None: - logger.debug(f"Expect {parent_op_type}, Got {parent.op_type}") - - return None - - def match_parent_paths(self, node, paths, output_name_to_node): - for i, path in enumerate(paths): - assert isinstance(path, List) or isinstance(path, Tuple) - return_indice = [] - matched = self.match_parent_path( - node, path[0], path[1], output_name_to_node, return_indice - ) - if matched: - return i, matched, return_indice - return -1, None, None - - def match_parent_path( - self, - node, - parent_op_types, - parent_input_index, - output_name_to_node=None, - return_indice=None, - ): - """ - Find a sequence of input edges based on constraints on parent op_type and index. - When input_index is None, we will find the first parent node based on constraints, and return_indice will be appended the corresponding input index. - - Args: - node (str): current node name. - parent_op_types (str): constraint of parent node op_type of each input edge. - parent_input_index (list): constraint of input index of each input edge. None means no constraint. - output_name_to_node (dict): dictionary with output name as key, and node as value. - return_indice (list): a list to append the input index when there is no constraint on input index of an edge. - - Returns: - parents: a list of matched parent node. 
- """ - assert len(parent_input_index) == len(parent_op_types) - - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - current_node = node - matched_parents = [] - for i, op_type in enumerate(parent_op_types): - matched_parent = self.match_parent( - current_node, - op_type, - parent_input_index[i], - output_name_to_node, - exclude=[], - return_indice=return_indice, - ) - if matched_parent is None: - logger.debug( - f"Failed to match index={i} parent_input_index={parent_input_index[i]} op_type={op_type}", - stack_info=True, - ) - return None - - matched_parents.append(matched_parent) - current_node = matched_parent - - return matched_parents - - def find_first_child_by_type( - self, node, child_type, input_name_to_nodes=None, recursive=True - ): - children = self.get_children(node, input_name_to_nodes) - dq = deque(children) - while len(dq) > 0: - current_node = dq.pop() - if current_node.op_type == child_type: - return current_node - - if recursive: - children = self.get_children(current_node, input_name_to_nodes) - for child in children: - dq.appendleft(child) - - return None - - def find_first_parent_by_type( - self, node, parent_type, output_name_to_node=None, recursive=True - ): - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - parents = self.get_parents(node, output_name_to_node) - dq = deque(parents) - while len(dq) > 0: - current_node = dq.pop() - if current_node.op_type == parent_type: - return current_node - - if recursive: - parents = self.get_parents(current_node, output_name_to_node) - for parent in parents: - dq.appendleft(parent) - - return None - - def get_constant_value(self, output_name): - for node in self.get_nodes_by_op_type("Constant"): - if node.output[0] == output_name: - for att in node.attribute: - if att.name == "value": - return numpy_helper.to_array(att.t) - - # Fall back to intializer since constant folding might have been applied. - initializer = self.get_initializer(output_name) - if initializer is not None: - return numpy_helper.to_array(initializer) - - return None - - def get_constant_input(self, node): - for i, input in enumerate(node.input): - value = self.get_constant_value(input) - if value is not None: - return i, value - - return None, None - - def find_constant_input(self, node, expected_value, delta=0.000001): - i, value = self.get_constant_input(node) - if ( - value is not None - and value.size == 1 - and abs(value - expected_value) < delta - ): - return i - - return -1 - - def is_constant_with_specified_dimension( - self, output_name, dimensions, description - ): - value = self.get_constant_value(output_name) - if value is None: - logger.debug(f"{description} {output_name} is not initializer.") - return False - - if len(value.shape) != dimensions: - logger.debug( - f"{description} {output_name} shall have {dimensions} dimensions. 
Got shape {value.shape}" - ) - return False - - return True - - def has_constant_input(self, node, expected_value, delta=0.000001): - return self.find_constant_input(node, expected_value, delta) >= 0 - - def get_children_subgraph_nodes( - self, root_node, stop_nodes, input_name_to_nodes=None - ): - if input_name_to_nodes is None: - input_name_to_nodes = self.input_name_to_nodes() - - children = input_name_to_nodes[root_node.output[0]] - - unique_nodes = [] - - dq = deque(children) - while len(dq) > 0: - current_node = dq.pop() - if current_node in stop_nodes: - continue - - if current_node not in unique_nodes: - unique_nodes.append(current_node) - - for output in current_node.output: - if output in input_name_to_nodes: - children = input_name_to_nodes[output] - for child in children: - dq.appendleft(child) - - return unique_nodes - - def tensor_shape_to_list(self, tensor_type): - """Convert tensor shape to list""" - shape_list = [] - for d in tensor_type.shape.dim: - if d.HasField("dim_value"): - shape_list.append(d.dim_value) # known dimension - elif d.HasField("dim_param"): - shape_list.append(d.dim_param) # unknown dimension with symbolic name - else: - shape_list.append("?") # shall not happen - return shape_list - - def get_dtype(self, input_or_output: str): - """Try get data type given a name (could be initializer, graph input or output).""" - tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info} - - if input_or_output in tensor_type_map: - return tensor_type_map[input_or_output].tensor_type.elem_type - - graph_input = self.find_graph_input(input_or_output) - if graph_input: - return graph_input.type.tensor_type.elem_type - - graph_output = self.find_graph_output(input_or_output) - if graph_output: - return graph_output.type.tensor_type.elem_type - - return None - - @staticmethod - def get_node_attribute(node: NodeProto, attribute_name: str): - for attr in node.attribute: - if attr.name == attribute_name: - value = helper.get_attribute_value(attr) - return value - return None - - def remove_cascaded_cast_nodes(self): - """Remove Cast node that are followed by another Cast node like --> Cast --> Cast --> - Note that this shall be used carefully since it might introduce semantic change. - For example, float -> int -> float could get different value than the original float value. - So, it is recommended to used only in post-processing of mixed precision conversion. - """ - output_name_to_node = self.output_name_to_node() - removed_count = 0 - for node in self.nodes(): - if node.op_type == "Cast": - parent = self.get_parent( - node, 0, output_name_to_node=output_name_to_node - ) - if parent and parent.op_type == "Cast": - node.input[0] = parent.input[0] - removed_count += 1 - - if removed_count > 0: - logger.info("Removed %d cascaded Cast nodes", removed_count) - self.prune_graph() - - def remove_useless_cast_nodes(self): - """Remove cast nodes that are not needed: input and output has same data type.""" - shape_infer = self.infer_runtime_shape(update=True) - if shape_infer is None: - logger.info( - f"Skip removing useless cast nodes since shape inference failed." 
- ) - return - - def get_data_type(input_or_output_name): - dtype = self.get_dtype(input_or_output_name) - if dtype: - return dtype - if shape_infer.known_vi_[input_or_output_name].type.tensor_type.HasField( - "elem_type" - ): - return shape_infer.known_vi_[ - input_or_output_name - ].type.tensor_type.elem_type - return None - - nodes_to_remove = [] - for node in self.nodes(): - if node.op_type == "Cast": - input_dtype = get_data_type(node.input[0]) - output_dtype = get_data_type(node.output[0]) - if input_dtype and input_dtype == output_dtype: - nodes_to_remove.append(node) - - if nodes_to_remove: - graph_input_names = set(self.get_graphs_input_names()) - graph_output_names = set(self.get_graphs_output_names()) - for node in nodes_to_remove: - if bool(set(node.output) & graph_output_names): - if not bool(set(node.input) & graph_input_names): - self.replace_output_of_all_nodes(node.input[0], node.output[0]) - else: - continue - else: - self.replace_input_of_all_nodes(node.output[0], node.input[0]) - self.remove_node(node) - - logger.info( - "Removed %d Cast nodes with output type same as input", - len(nodes_to_remove), - ) - - def convert_model_float32_to_float16(self, cast_input_output=True): - logger.warning( - "The function convert_model_float32_to_float16 is deprecated. Use convert_float_to_float16 instead!" - ) - self.convert_float_to_float16( - use_symbolic_shape_infer=True, keep_io_types=cast_input_output - ) - - def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs): - """Convert a model to half (default) or mixed precision. - To use mixed precision, user need specify which graph inputs, outputs, operator type or list of nodes shall keep in float32. - By default, we use symbolic shape inference to get shape and type information. If not, ONNX shape inference will be used. - Note that symbolic/ONNX shape inference might fail, and the conversion might not proceed without shape and type information. - - Args: - use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference. Defaults to True. - keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names. - If True, model inputs/outputs should be left as float32. Defaults to False. - op_block_list (List[str], optional): List of operator types to leave as float32. - Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST` as default. - node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None. - force_fp16_initializers(bool): force converting all float initializers to float16. - Default to false, which will convert only the one needed to avoid precision loss. - min_positive_val (float, optional): minimal positive value. Defaults to 1e-7. - max_finite_val (float, optional): maximal finite value. Defaults to 1e4. - """ - if "keep_io_types" not in kwargs: - kwargs["keep_io_types"] = True - - model = self.model - if use_symbolic_shape_infer: - # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc) are not recognized by onnx shape inference. 
- shape_infer_helper = SymbolicShapeInferenceHelper(model) - model = shape_infer_helper.infer_shapes( - model, auto_merge=True, guess_output_rank=False - ) - - parameters = {"disable_shape_infer": use_symbolic_shape_infer} - parameters.update( - { - key: kwargs[key] - for key in [ - "keep_io_types", - "min_positive_val", - "max_finite_val", - "op_block_list", - "node_block_list", - "force_fp16_initializers", - ] - if key in kwargs - } - ) - - fp16_model = convert_float_to_float16(model, **parameters) - self.initialize(fp16_model) - - self.remove_cascaded_cast_nodes() - - self.remove_useless_cast_nodes() - - def create_node_name(self, op_type, name_prefix=None): - """Create a unique node name that starts with a prefix (default is operator type). - The name will not be duplicated with any name that generated or existed in current graphs. - Args: - op_type (str): operator type - name_prefix (str, optional): prefix of node name. Defaults to None. - - Returns: - str: node name - """ - - if name_prefix: - prefix = name_prefix if name_prefix.endswith("_") else (name_prefix + "_") - else: - prefix = op_type + "_" - - suffix: int = 0 - if prefix in self._node_name_suffix: - suffix = self._node_name_suffix[prefix] + 1 - else: - # Check existed node name only once for a prefix as we assume create_node_name is called for every new node in fusion. - for node in self.nodes(): - if node.name and node.name.startswith(prefix): - try: - index = int(node.name[len(prefix) :]) - suffix = max(index + 1, suffix) - except ValueError: - continue - - # Record the generated suffix so that we can avoid generating duplicated name. - self._node_name_suffix[prefix] = suffix - - return prefix + str(suffix) - - def find_graph_input(self, input_name): - for input in self.model.graph.input: - if input.name == input_name: - return input - return None - - def find_graph_output(self, output_name): - for output in self.model.graph.output: - if output.name == output_name: - return output - return None - - def get_parent_subgraph_nodes(self, node, stop_nodes, output_name_to_node=None): - if output_name_to_node is None: - output_name_to_node = self.output_name_to_node() - - unique_nodes = [] - - parents = self.get_parents(node, output_name_to_node) - dq = deque(parents) - while len(dq) > 0: - current_node = dq.pop() - if current_node in stop_nodes: - continue - - if current_node not in unique_nodes: - unique_nodes.append(current_node) - - for input in current_node.input: - if input in output_name_to_node: - dq.appendleft(output_name_to_node[input]) - - return unique_nodes - - def get_graph_inputs(self, current_node, recursive=False): - """ - Find graph inputs that linked to current node. 
- """ - graph_inputs = [] - for input in current_node.input: - if self.find_graph_input(input) and input not in graph_inputs: - graph_inputs.append(input) - - if recursive: - parent_nodes = self.get_parent_subgraph_nodes(current_node, []) - for node in parent_nodes: - for input in node.input: - if self.find_graph_input(input) and input not in graph_inputs: - graph_inputs.append(input) - return graph_inputs - - @staticmethod - def input_index(node_output, child_node): - index = 0 - for input in child_node.input: - if input == node_output: - return index - index += 1 - return -1 - - def remove_unused_constant(self): - input_name_to_nodes = self.input_name_to_nodes() - - # remove unused constant - unused_nodes = [] - nodes = self.nodes() - for node in nodes: - if node.op_type == "Constant" and node.output[0] not in input_name_to_nodes: - unused_nodes.append(node) - - self.remove_nodes(unused_nodes) - - if len(unused_nodes) > 0: - logger.debug(f"Removed unused constant nodes: {len(unused_nodes)}") - - def prune_graph(self, outputs=None): - """ - Prune graph to keep only required outputs. It removes unnecessary inputs and nodes. - Nodes are not linked (directly or indirectly) to any required output will be removed. - - Args: - outputs (list): a list of graph outputs to retain. If it is None, all graph outputs will be kept. - """ - if len(self.graphs()) > 1: - logger.debug(f"Skip prune_graph since graph has subgraph") - return - - if outputs is None: - outputs = [output.name for output in self.model.graph.output] - - output_name_to_node = self.output_name_to_node() - all_nodes = [] - for output in outputs: - if output in output_name_to_node: - last_node = output_name_to_node[output] - if last_node in all_nodes: - continue - nodes = self.get_parent_subgraph_nodes(last_node, []) - all_nodes.append(last_node) - all_nodes.extend(nodes) - - nodes_to_remove = [] - for node in self.model.graph.node: - if node not in all_nodes: - nodes_to_remove.append(node) - - self.remove_nodes(nodes_to_remove) - - # remove outputs not in list - output_to_remove = [] - for output in self.model.graph.output: - if output.name not in outputs: - output_to_remove.append(output) - for output in output_to_remove: - self.model.graph.output.remove(output) - - # remove inputs not used by any node. 
- input_name_to_nodes = self.input_name_to_nodes() - input_to_remove = [] - for input in self.model.graph.input: - if input.name not in input_name_to_nodes: - input_to_remove.append(input) - for input in input_to_remove: - self.model.graph.input.remove(input) - - if input_to_remove or output_to_remove or nodes_to_remove: - logger.info( - "Graph pruned: {} inputs, {} outputs and {} nodes are removed".format( - len(input_to_remove), len(output_to_remove), len(nodes_to_remove) - ) - ) - - self.update_graph() - - def update_graph(self, verbose=False): - graph = self.model.graph - - remaining_input_names = [] - for node in graph.node: - if node.op_type in ["Loop", "Scan", "If"]: - # TODO: handle inner graph - logger.debug( - f"Skip update_graph since graph has operator: {node.op_type}" - ) - return - if node.op_type != "Constant": - for input_name in node.input: - if input_name not in remaining_input_names: - remaining_input_names.append(input_name) - if verbose: - logger.debug(f"remaining input names: {remaining_input_names}") - - # remove graph input that is not used - inputs_to_remove = [] - for input in graph.input: - if input.name not in remaining_input_names: - inputs_to_remove.append(input) - for input in inputs_to_remove: - graph.input.remove(input) - - names_to_remove = [input.name for input in inputs_to_remove] - logger.debug(f"remove {len(inputs_to_remove)} unused inputs: {names_to_remove}") - - # remove weights that are not used - weights_to_remove = [] - weights_to_keep = [] - for initializer in graph.initializer: - if ( - initializer.name not in remaining_input_names - and not self.find_graph_output(initializer.name) - ): - weights_to_remove.append(initializer) - else: - weights_to_keep.append(initializer.name) - for initializer in weights_to_remove: - graph.initializer.remove(initializer) - - names_to_remove = [initializer.name for initializer in weights_to_remove] - logger.debug( - f"remove {len(weights_to_remove)} unused initializers: {names_to_remove}" - ) - if verbose: - logger.debug(f"remaining initializers:{weights_to_keep}") - - self.remove_unused_constant() - - def is_safe_to_fuse_nodes( - self, nodes_to_remove, keep_outputs, input_name_to_nodes, output_name_to_node - ): - for node_to_remove in nodes_to_remove: - for output_to_remove in node_to_remove.output: - if output_to_remove in keep_outputs: - continue - - if output_to_remove in input_name_to_nodes: - for impacted_node in input_name_to_nodes[output_to_remove]: - if impacted_node not in nodes_to_remove: - logger.debug( - f"it is not safe to remove nodes since output {output_to_remove} is used by {impacted_node}" - ) - return False - return True - - @staticmethod - def graph_topological_sort(graph): - deps_count = [0] * len(graph.node) # dependency count of each node - deps_to_nodes = {} # input to node indice - sorted_nodes = [] # initialize sorted_nodes - for node_idx, node in enumerate(graph.node): - # CANNOT use len(node.input) directly because input can be optional - deps_count[node_idx] = sum(1 for _ in node.input if _) - if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs - sorted_nodes.append(graph.node[node_idx]) - continue - - for input_name in node.input: - if input_name not in deps_to_nodes: - deps_to_nodes[input_name] = [node_idx] - else: - deps_to_nodes[input_name].append(node_idx) - - # Note: this logic only applies to top level graph since a sub graph could use intializer from parent graph - initializer_names = [init.name for init in graph.initializer] - graph_input_names = 
[input.name for input in graph.input] - input_names = initializer_names + graph_input_names - input_names.sort() - prev_input_name = None - for input_name in input_names: - if prev_input_name == input_name: - continue - - prev_input_name = input_name - if input_name in deps_to_nodes: - for node_idx in deps_to_nodes[input_name]: - deps_count[node_idx] = deps_count[node_idx] - 1 - if deps_count[node_idx] == 0: - sorted_nodes.append(graph.node[node_idx]) - - start = 0 - end = len(sorted_nodes) - - while start < end: - for output in sorted_nodes[start].output: - if output in deps_to_nodes: - for node_idx in deps_to_nodes[output]: - deps_count[node_idx] = deps_count[node_idx] - 1 - if deps_count[node_idx] == 0: - sorted_nodes.append(graph.node[node_idx]) - end = end + 1 - start = start + 1 - - if end != len(graph.node): - raise RuntimeError( - f"Graph is not a DAG: end={end}, len(graph.node)={len(graph.node)}, graph.node[end]={graph.node[end]}" - ) - - graph.ClearField("node") - graph.node.extend(sorted_nodes) - - def topological_sort(self): - # TODO: support graph_topological_sort() in subgraphs - # for graph in self.graphs(): - # self.graph_topological_sort(graph) - OnnxModel.graph_topological_sort(self.model.graph) - - @staticmethod - def save( - model, - output_path, - save_as_external_data=False, - all_tensors_to_one_file=True, - size_threshold=1024, - convert_attribute=False, - ): - Path(output_path).parent.mkdir(parents=True, exist_ok=True) - - if save_as_external_data: - # Save model to external data, which is needed for model size > 2GB - output_dir = Path(output_path).parent - output_dir.mkdir(parents=True, exist_ok=True) - external_data_path = output_path + ".data" - location = ( - Path(external_data_path).name if all_tensors_to_one_file else None - ) - - if os.path.exists(output_path): - logger.info(f"Delete the existed onnx file: {output_path}") - os.remove(output_path) - - if all_tensors_to_one_file: - if os.path.exists(external_data_path): - # Delete the external data file. Otherwise, data will be appended to existing file. - logger.info( - f"Delete the existed external data file: {external_data_path}" - ) - os.remove(external_data_path) - else: - if os.listdir(output_dir): - raise RuntimeError( - f"Output directory ({output_dir}) for external data is not empty." - ) - - save_model( - model, - output_path, - save_as_external_data=True, - all_tensors_to_one_file=all_tensors_to_one_file, - location=location, - size_threshold=size_threshold, - convert_attribute=convert_attribute, - ) - else: - save_model(model, output_path) - - def save_model_to_file( - self, output_path, use_external_data_format=False, all_tensors_to_one_file=True - ): - logger.info(f"Sort graphs in topological order") - self.topological_sort() - - if output_path.endswith(".json"): # Output text for testing small model. - with open(output_path, "w") as out: - out.write(str(model)) - else: - OnnxModel.save( - self.model, - output_path, - use_external_data_format, - all_tensors_to_one_file, - ) - logger.info(f"Model saved to {output_path}") - - def get_graph_inputs_excluding_initializers(self): - """ - Returns real graph inputs (excluding initializers from older onnx model). - """ - graph_inputs = [] - for input in self.model.graph.input: - if self.get_initializer(input.name) is None: - graph_inputs.append(input) - return graph_inputs - - def get_opset_version(self): - """Get opset version of onnx domain - - Raises: - RuntimeError: ONNX model has no opset for default domain. 
- - Returns: - int: opset version of onnx domain. - """ - for opset in self.model.opset_import: - if opset.domain in ["", "ai.onnx"]: - return opset.version - raise RuntimeError("ONNX model has no opset for default domain") - - @staticmethod - def has_same_value(tensor1: TensorProto, tensor2: TensorProto) -> bool: - """Returns True when two tensors have same value. - Note that name can be different. - - Args: - tensor1 (TensorProto): initializer 1 - tensor2 (TensorProto): initializer 2 - - Returns: - bool: True when two intializers has same value. - """ - if tensor1.data_type != tensor2.data_type or tensor1.dims != tensor2.dims: - return False - if tensor1.HasField("raw_data") and tensor2.HasField("raw_data"): - return tensor1.raw_data == tensor2.raw_data - return numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2) - - def remove_duplicated_initializer(self): - """Remove initializers with duplicated values, and only keep the first one. - It could help reduce size of models (like ALBert) with shared weights. - Note: this function does not process subgraph. - """ - if len(self.graphs()) > 1: - logger.warning("remove_duplicated_initializer does not process subgraphs.") - - initializer_count = len(self.model.graph.initializer) - - same = [-1] * initializer_count - for i in range(initializer_count - 1): - if same[i] >= 0: - continue - for j in range(i + 1, initializer_count): - if OnnxModel.has_same_value( - self.model.graph.initializer[i], self.model.graph.initializer[j] - ): - same[j] = i - - count = 0 - for i in range(initializer_count): - if same[i] >= 0: - count += 1 - self.replace_input_of_all_nodes( - self.model.graph.initializer[i].name, - self.model.graph.initializer[same[i]].name, - ) - - if count > 0: - self.update_graph() - print(f"Removed {count} initializers with duplicated value") - - def add_prefix_to_names(self, prefix: str): - """Add prefix to initializer or intermediate outputs in graph. Main graph inputs and outputs are excluded. - It could help avoid conflicting in name of node_args when merging two graphs. - Note: this function does not process subgraph. 
- """ - if len(self.graphs()) > 1: - logger.warning("add_prefix_to_names does not process subgraphs.") - - # Exclude the names of inputs and outputs of main graph (but not subgraphs) - excluded = [i.name for i in self.model.graph.input] + [ - o.name for o in self.model.graph.output - ] - - for initializer in self.model.graph.initializer: - if initializer.name not in excluded: - if prefix + initializer.name not in excluded: - initializer.name = prefix + initializer.name - - for node in self.model.graph.node: - # update name of node inputs - for j in range(len(node.input)): - if node.input[j] not in excluded: - if prefix + node.input[j] not in excluded: - node.input[j] = prefix + node.input[j] - - # update name of node outputs - for j in range(len(node.output)): - if node.output[j] not in excluded: - if prefix + node.output[j] not in excluded: - node.output[j] = prefix + node.output[j] - - for value_info in self.model.graph.value_info: - if value_info.name not in excluded: - value_info.name = prefix + value_info.name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py deleted file mode 100644 index a48b53db83fa675713cd9e4ac3b38d2ed554a73b..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - -import logging -import os -import sys -from typing import Dict - -# In ORT Package the symbolic_shape_infer.py is in ../tools -file_path = os.path.dirname(__file__) -if os.path.exists(os.path.join(file_path, "../tools/symbolic_shape_infer.py")): - sys.path.append(os.path.join(file_path, "../tools")) -else: - sys.path.append(os.path.join(file_path, "..")) - -from .symbolic_shape_infer import ( - SymbolicShapeInference, - get_shape_from_type_proto, - sympy, -) - -logger = logging.getLogger(__name__) - - -class SymbolicShapeInferenceHelper(SymbolicShapeInference): - def __init__( - self, - model, - verbose=0, - int_max=2**31 - 1, - auto_merge=True, - guess_output_rank=False, - ): - super().__init__(int_max, auto_merge, guess_output_rank, verbose) - self.model_ = model - self.all_shapes_inferred_: bool = False - self.is_inferred_: bool = False - self.dynamic_axis_mapping_: Dict[str, int] = {} - - def infer(self, dynamic_axis_mapping: Dict[str, int], max_runs: int = 128): - """Run shape inference, and try replace dynamic axis from string to integer when mapping is provided. 
- - Args: - dynamic_axis_mapping (_type_): a dictionary with name of dynamic axis as key, like {"batch_size" : 4} - max_runs (int, optional): limit maximum number of runs to avoid infinite loop. Defaults to 32. - - Returns: - bool: whether all shapes has been inferred or not. - """ - assert dynamic_axis_mapping is not None - - if self.is_inferred_ and self.dynamic_axis_mapping_ == dynamic_axis_mapping: - return self.all_shapes_inferred_ - - self.dynamic_axis_mapping_ = dynamic_axis_mapping - - self._preprocess(self.model_) - - count = 0 - while self.run_: - logger.debug(f"shape infer run {count}") - self.all_shapes_inferred_ = self._infer_impl() - count += 1 - if max_runs > 0 and count >= max_runs: - break - - self.is_inferred_ = True - return self.all_shapes_inferred_ - - def _get_sympy_shape(self, node, idx): - """Override it to ensure shape inference by giving the actual value of dynamic axis.""" - sympy_shape = [] - - shape = self._get_shape(node, idx) - if shape: - for dim in shape: - if isinstance(dim, str): - if dim in self.dynamic_axis_mapping_: - sympy_shape.append(self.dynamic_axis_mapping_[dim]) - elif dim in self.symbolic_dims_: - sympy_shape.append(self.symbolic_dims_[dim]) - else: - sympy_shape.append(sympy.Symbol(dim, integer=True)) - else: - assert dim is not None - sympy_shape.append(dim) - return sympy_shape - - def get_edge_shape(self, edge): - """Get shape of an edge. - - Args: - edge (str): name of edge - - Returns: - Optional[List[int]]: the shape, or None if shape is unknown - """ - assert self.all_shapes_inferred_ - if edge not in self.known_vi_: - print("Cannot retrieve the shape of " + str(edge)) - return None - - type_proto = self.known_vi_[edge].type - shape = get_shape_from_type_proto(type_proto) - - if shape is not None: - for i, dim in enumerate(shape): - if isinstance(dim, str) and dim in self.dynamic_axis_mapping_: - shape[i] = self.dynamic_axis_mapping_[dim] - - return shape - - def compare_shape(self, edge, edge_other): - """Compare shape of two edges. - - Args: - edge (str): name of edge - edge_other (str): name of another edge - - Raises: - Exception: At least one shape is missed for edges to compare - - Returns: - bool: whether the shape is same or not - """ - assert self.all_shapes_inferred_ - shape = self.get_edge_shape(edge) - shape_other = self.get_edge_shape(edge_other) - if shape is None or shape_other is None: - raise Exception("At least one shape is missed for edges to compare") - return shape == shape_other diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py deleted file mode 100644 index 2311ad57fdefa502a9e6d7edf44dc884c843ee51..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py +++ /dev/null @@ -1,2805 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. -# - -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -# -*- coding: UTF-8 -*- -import argparse -import logging - -import numpy as np -import onnx -import sympy -from onnx import helper, numpy_helper, shape_inference -from packaging import version - -assert version.parse(onnx.__version__) >= version.parse("1.8.0") - -logger = logging.getLogger(__name__) - - -def get_attribute(node, attr_name, default_value=None): - found = [attr for attr in node.attribute if attr.name == attr_name] - if found: - return helper.get_attribute_value(found[0]) - return default_value - - -def get_dim_from_proto(dim): - return ( - getattr(dim, dim.WhichOneof("value")) - if type(dim.WhichOneof("value")) == str - else None - ) - - -def is_sequence(type_proto): - cls_type = type_proto.WhichOneof("value") - assert cls_type in ["tensor_type", "sequence_type"] - return cls_type == "sequence_type" - - -def get_shape_from_type_proto(type_proto): - assert not is_sequence(type_proto) - if type_proto.tensor_type.HasField("shape"): - return [get_dim_from_proto(d) for d in type_proto.tensor_type.shape.dim] - else: - return None # note no shape is different from shape without dim (scalar) - - -def get_shape_from_value_info(vi): - cls_type = vi.type.WhichOneof("value") - if cls_type is None: - return None - if is_sequence(vi.type): - if "tensor_type" == vi.type.sequence_type.elem_type.WhichOneof("value"): - return get_shape_from_type_proto(vi.type.sequence_type.elem_type) - else: - return None - else: - return get_shape_from_type_proto(vi.type) - - -def make_named_value_info(name): - vi = onnx.ValueInfoProto() - vi.name = name - return vi - - -def get_shape_from_sympy_shape(sympy_shape): - return [ - None if i is None else (int(i) if is_literal(i) else str(i)) - for i in sympy_shape - ] - - -def is_literal(dim): - return type(dim) in [int, np.int64, np.int32, sympy.Integer] or ( - hasattr(dim, "is_number") and dim.is_number - ) - - -def handle_negative_axis(axis, rank): - assert axis < rank and axis >= -rank - return axis if axis >= 0 else rank + axis - - -def get_opset(mp, domain=None): - domain = domain or ["", "onnx", "ai.onnx"] - if type(domain) != list: - domain = [domain] - for opset in mp.opset_import: - if opset.domain in domain: - return opset.version - - return None - - -def as_scalar(x): - if type(x) == list: - assert len(x) == 1 - return x[0] - elif type(x) == np.ndarray: - return x.item() - else: - return x - - -def as_list(x, keep_none): - if type(x) == list: - return x - elif type(x) == np.ndarray: - return list(x) - elif keep_none and x is None: - return None - else: - return [x] - - -def sympy_reduce_product(x): - if type(x) == list: - value = sympy.Integer(1) - for v in x: - value = value * v - else: - value = x - return value - - -class SymbolicShapeInference: - def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): - self.dispatcher_ = { - "Add": self._infer_symbolic_compute_ops, - "ArrayFeatureExtractor": self._infer_ArrayFeatureExtractor, - "AveragePool": self._infer_Pool, - "BatchNormalization": self._infer_BatchNormalization, - "Cast": self._infer_Cast, - "CategoryMapper": self._infer_CategoryMapper, - "Compress": self._infer_Compress, - "Concat": self._infer_Concat, - "ConcatFromSequence": self._infer_ConcatFromSequence, - "Constant": self._infer_Constant, - "ConstantOfShape": self._infer_ConstantOfShape, - "Conv": 
self._infer_Conv, - "CumSum": self._pass_on_shape_and_type, - "Div": self._infer_symbolic_compute_ops, - "Einsum": self._infer_Einsum, - "Expand": self._infer_Expand, - "Equal": self._infer_symbolic_compute_ops, - "Floor": self._infer_symbolic_compute_ops, - "Gather": self._infer_Gather, - "GatherElements": self._infer_GatherElements, - "GatherND": self._infer_GatherND, - "Identity": self._pass_on_shape_and_type, - "If": self._infer_If, - "Loop": self._infer_Loop, - "MatMul": self._infer_MatMul, - "MatMulInteger16": self._infer_MatMulInteger, - "MaxPool": self._infer_Pool, - "Max": self._infer_symbolic_compute_ops, - "Min": self._infer_symbolic_compute_ops, - "Mul": self._infer_symbolic_compute_ops, - "NonMaxSuppression": self._infer_NonMaxSuppression, - "NonZero": self._infer_NonZero, - "OneHot": self._infer_OneHot, - "Pad": self._infer_Pad, - "Range": self._infer_Range, - "Reciprocal": self._pass_on_shape_and_type, - "ReduceSum": self._infer_ReduceSum, - "ReduceProd": self._infer_ReduceProd, - "Reshape": self._infer_Reshape, - "Resize": self._infer_Resize, - "Round": self._pass_on_shape_and_type, - "Scan": self._infer_Scan, - "ScatterElements": self._infer_ScatterElements, - "SequenceAt": self._infer_SequenceAt, - "SequenceInsert": self._infer_SequenceInsert, - "Shape": self._infer_Shape, - "Size": self._infer_Size, - "Slice": self._infer_Slice, - "SoftmaxCrossEntropyLoss": self._infer_SoftmaxCrossEntropyLoss, - "SoftmaxCrossEntropyLossInternal": self._infer_SoftmaxCrossEntropyLoss, - "NegativeLogLikelihoodLossInternal": self._infer_SoftmaxCrossEntropyLoss, - "Split": self._infer_Split, - "SplitToSequence": self._infer_SplitToSequence, - "Squeeze": self._infer_Squeeze, - "Sub": self._infer_symbolic_compute_ops, - "Tile": self._infer_Tile, - "TopK": self._infer_TopK, - "Transpose": self._infer_Transpose, - "Unsqueeze": self._infer_Unsqueeze, - "Where": self._infer_symbolic_compute_ops, - "ZipMap": self._infer_ZipMap, - "Neg": self._infer_symbolic_compute_ops, - # contrib ops: - "Attention": self._infer_Attention, - "BiasGelu": self._infer_BiasGelu, - "EmbedLayerNormalization": self._infer_EmbedLayerNormalization, - "FastGelu": self._infer_FastGelu, - "Gelu": self._infer_Gelu, - "LayerNormalization": self._infer_LayerNormalization, - "LongformerAttention": self._infer_LongformerAttention, - "PythonOp": self._infer_PythonOp, - "SkipLayerNormalization": self._infer_SkipLayerNormalization, - } - self.aten_op_dispatcher_ = { - "embedding": self._infer_Gather, - "bitwise_or": self._infer_aten_bitwise_or, - "diagonal": self._infer_aten_diagonal, - "max_pool2d_with_indices": self._infer_aten_pool2d, - "max": self._infer_aten_minmax, - "min": self._infer_aten_minmax, - "multinomial": self._infer_aten_multinomial, - "unfold": self._infer_aten_unfold, - "argmax": self._infer_aten_argmax, - "avg_pool2d": self._infer_aten_pool2d, - "_adaptive_avg_pool2d": self._infer_aten_pool2d, - "numpy_T": self._infer_Transpose, - } - self.run_ = True - self.suggested_merge_ = {} - self.symbolic_dims_ = {} - self.input_symbols_ = {} - self.auto_merge_ = auto_merge - self.guess_output_rank_ = guess_output_rank - self.verbose_ = verbose - self.int_max_ = int_max - self.subgraph_id_ = 0 - self.prefix_ = prefix - - def _add_suggested_merge(self, symbols, apply=False): - assert all( - [ - (type(s) == str and s in self.symbolic_dims_) or is_literal(s) - for s in symbols - ] - ) - symbols = set(symbols) - for k, v in self.suggested_merge_.items(): - if k in symbols: - symbols.remove(k) - symbols.add(v) - map_to = None - # 
if there is literal, map to it first - for s in symbols: - if is_literal(s): - map_to = s - break - # when no literals, map to input symbolic dims, then existing symbolic dims - if map_to is None: - for s in symbols: - if s in self.input_symbols_: - map_to = s - break - if map_to is None: - for s in symbols: - if type(self.symbolic_dims_[s]) == sympy.Symbol: - map_to = s - break - # when nothing to map to, use the shorter one - if map_to is None: - if self.verbose_ > 0: - logger.warning( - "Potential unsafe merge between symbolic expressions: ({})".format( - ",".join(symbols) - ) - ) - symbols_list = list(symbols) - lens = [len(s) for s in symbols_list] - map_to = symbols_list[lens.index(min(lens))] - symbols.remove(map_to) - - for s in symbols: - if s == map_to: - continue - if is_literal(map_to) and is_literal(s): - assert int(map_to) == int(s) - self.suggested_merge_[s] = int(map_to) if is_literal(map_to) else map_to - for k, v in self.suggested_merge_.items(): - if v == s: - self.suggested_merge_[k] = map_to - if apply and self.auto_merge_: - self._apply_suggested_merge() - - def _apply_suggested_merge(self, graph_input_only=False): - if not self.suggested_merge_: - return - for i in list(self.out_mp_.graph.input) + ( - [] if graph_input_only else list(self.out_mp_.graph.value_info) - ): - for d in i.type.tensor_type.shape.dim: - if d.dim_param in self.suggested_merge_: - v = self.suggested_merge_[d.dim_param] - if is_literal(v): - d.dim_value = int(v) - else: - d.dim_param = v - - def _preprocess(self, in_mp): - self.out_mp_ = onnx.ModelProto() - self.out_mp_.CopyFrom(in_mp) - self.graph_inputs_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) - self.initializers_ = dict([(i.name, i) for i in self.out_mp_.graph.initializer]) - self.known_vi_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) - self.known_vi_.update( - dict( - [ - ( - i.name, - helper.make_tensor_value_info( - i.name, i.data_type, list(i.dims) - ), - ) - for i in self.out_mp_.graph.initializer - ] - ) - ) - - def _merge_symbols(self, dims): - if not all([type(d) == str for d in dims]): - if self.auto_merge_: - unique_dims = list(set(dims)) - is_int = [is_literal(d) for d in unique_dims] - assert ( - sum(is_int) <= 1 - ) # if there are more than 1 unique ints, something is wrong - if sum(is_int) == 1: - int_dim = is_int.index(1) - if self.verbose_ > 0: - logger.debug( - "dim {} has been merged with value {}".format( - unique_dims[:int_dim] + unique_dims[int_dim + 1 :], - unique_dims[int_dim], - ) - ) - self._check_merged_dims(unique_dims, allow_broadcast=False) - return unique_dims[int_dim] - else: - if self.verbose_ > 0: - logger.debug( - "dim {} has been mergd with dim {}".format( - unique_dims[1:], unique_dims[0] - ) - ) - return dims[0] - else: - return None - if all([d == dims[0] for d in dims]): - return dims[0] - merged = [ - self.suggested_merge_[d] if d in self.suggested_merge_ else d for d in dims - ] - if all([d == merged[0] for d in merged]): - assert merged[0] in self.symbolic_dims_ - return merged[0] - else: - return None - - # broadcast from right to left, and merge symbolic dims if needed - def _broadcast_shapes(self, shape1, shape2): - new_shape = [] - rank1 = len(shape1) - rank2 = len(shape2) - new_rank = max(rank1, rank2) - for i in range(new_rank): - dim1 = shape1[rank1 - 1 - i] if i < rank1 else 1 - dim2 = shape2[rank2 - 1 - i] if i < rank2 else 1 - if dim1 == 1 or dim1 == dim2: - new_dim = dim2 - elif dim2 == 1: - new_dim = dim1 - else: - new_dim = self._merge_symbols([dim1, 
dim2]) - if not new_dim: - # warning about unsupported broadcast when not auto merge - # note that auto merge has the risk of incorrectly merge symbols while one of them being 1 - # for example, 'a' = 1, 'b' = 5 at runtime is valid broadcasting, but with auto merge 'a' == 'b' - if self.auto_merge_: - self._add_suggested_merge([dim1, dim2], apply=True) - else: - logger.warning( - "unsupported broadcast between " - + str(dim1) - + " " - + str(dim2) - ) - new_shape = [new_dim] + new_shape - return new_shape - - def _get_shape(self, node, idx): - name = node.input[idx] - if name in self.known_vi_: - vi = self.known_vi_[name] - return get_shape_from_value_info(vi) - else: - assert name in self.initializers_ - return list(self.initializers_[name].dims) - - def _get_shape_rank(self, node, idx): - return len(self._get_shape(node, idx)) - - def _get_sympy_shape(self, node, idx): - sympy_shape = [] - for d in self._get_shape(node, idx): - if type(d) == str: - sympy_shape.append( - self.symbolic_dims_[d] - if d in self.symbolic_dims_ - else sympy.Symbol(d, integer=True, nonnegative=True) - ) - else: - assert None != d - sympy_shape.append(d) - return sympy_shape - - def _get_value(self, node, idx): - name = node.input[idx] - assert name in self.sympy_data_ or name in self.initializers_ - return ( - self.sympy_data_[name] - if name in self.sympy_data_ - else numpy_helper.to_array(self.initializers_[name]) - ) - - def _try_get_value(self, node, idx): - if idx >= len(node.input): - return None - name = node.input[idx] - if name in self.sympy_data_ or name in self.initializers_: - return self._get_value(node, idx) - return None - - def _update_computed_dims(self, new_sympy_shape): - for i, new_dim in enumerate(new_sympy_shape): - if not is_literal(new_dim) and not type(new_dim) == str: - str_dim = str(new_dim) - if str_dim in self.suggested_merge_: - if is_literal(self.suggested_merge_[str_dim]): - continue # no need to create dim for literals - new_sympy_shape[i] = self.symbolic_dims_[ - self.suggested_merge_[str_dim] - ] - else: - # add new_dim if it's a computational expression - if not str(new_dim) in self.symbolic_dims_: - self.symbolic_dims_[str(new_dim)] = new_dim - - def _onnx_infer_single_node(self, node): - # skip onnx shape inference for some ops, as they are handled in _infer_* - skip_infer = node.op_type in [ - "If", - "Loop", - "Scan", - "SplitToSequence", - "ZipMap", # contrib ops - "Attention", - "BiasGelu", - "EmbedLayerNormalization", - "FastGelu", - "Gelu", - "LayerNormalization", - "LongformerAttention", - "SkipLayerNormalization", - "PythonOp", - ] - - if not skip_infer: - # Only pass initializers that satisfy the following condition: - # (1) Operator need value of some input for shape inference. - # For example, Unsqueeze in opset 13 uses the axes input to calculate shape of output. - # (2) opset version >= 9. In older version, initializer is required in graph input by onnx spec. - # (3) The initializer is not in graph input. The means the node input is "constant" in inference. 
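# --- A minimal, self-contained sketch (not part of the diff) of the point made in the
# --- comment above: for opset >= 13, Unsqueeze takes its axes as an input, so ONNX shape
# --- inference can only resolve the output shape when that axes initializer is included
# --- in the temporary graph. Tensor names and values below are illustrative only.
import numpy as np
import onnx
from onnx import helper, numpy_helper, shape_inference

x = helper.make_tensor_value_info("x", onnx.TensorProto.FLOAT, [2, 3])
y = helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, None)
axes = numpy_helper.from_array(np.array([0], dtype=np.int64), name="axes")
node = helper.make_node("Unsqueeze", ["x", "axes"], ["y"])
graph = helper.make_graph([node], "tmp", [x], [y], initializer=[axes])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
inferred = shape_inference.infer_shapes(model)
# With the "axes" initializer present, the inferred output shape should be [1, 2, 3];
# without it, the output rank/dims stay unknown.
print(inferred.graph.output[0].type.tensor_type.shape)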
- initializers = [] - if (get_opset(self.out_mp_) >= 9) and node.op_type in ["Unsqueeze"]: - initializers = [ - self.initializers_[name] - for name in node.input - if (name in self.initializers_ and name not in self.graph_inputs_) - ] - - # run single node inference with self.known_vi_ shapes - tmp_graph = helper.make_graph( - [node], - "tmp", - [self.known_vi_[i] for i in node.input if i], - [make_named_value_info(i) for i in node.output], - initializers, - ) - - self.tmp_mp_.graph.CopyFrom(tmp_graph) - - self.tmp_mp_ = shape_inference.infer_shapes(self.tmp_mp_) - - for i_o in range(len(node.output)): - o = node.output[i_o] - vi = self.out_mp_.graph.value_info.add() - if not skip_infer: - vi.CopyFrom(self.tmp_mp_.graph.output[i_o]) - else: - vi.name = o - self.known_vi_[o] = vi - - def _onnx_infer_subgraph( - self, node, subgraph, use_node_input=True, inc_subgraph_id=True - ): - if self.verbose_ > 2: - logger.debug( - "Inferencing subgraph of node {} with output({}...): {}".format( - node.name, node.output[0], node.op_type - ) - ) - # node inputs are not passed directly to the subgraph - # it's up to the node dispatcher to prepare subgraph input - # for example, with Scan/Loop, subgraph input shape would be trimmed from node input shape - # besides, inputs in subgraph could shadow implicit inputs - subgraph_inputs = set( - [i.name for i in list(subgraph.initializer) + list(subgraph.input)] - ) - subgraph_implicit_input = set( - [name for name in self.known_vi_.keys() if not name in subgraph_inputs] - ) - tmp_graph = helper.make_graph( - list(subgraph.node), - "tmp", - list(subgraph.input) + [self.known_vi_[i] for i in subgraph_implicit_input], - [make_named_value_info(i.name) for i in subgraph.output], - ) - tmp_graph.initializer.extend( - [ - i - for i in self.out_mp_.graph.initializer - if i.name in subgraph_implicit_input - ] - ) - tmp_graph.initializer.extend(subgraph.initializer) - self.tmp_mp_.graph.CopyFrom(tmp_graph) - - symbolic_shape_inference = SymbolicShapeInference( - self.int_max_, - self.auto_merge_, - self.guess_output_rank_, - self.verbose_, - prefix=self.prefix_ + "_" + str(self.subgraph_id_), - ) - if inc_subgraph_id: - self.subgraph_id_ += 1 - - all_shapes_inferred = False - symbolic_shape_inference._preprocess(self.tmp_mp_) - symbolic_shape_inference.suggested_merge_ = self.suggested_merge_.copy() - while symbolic_shape_inference.run_: - all_shapes_inferred = symbolic_shape_inference._infer_impl( - self.sympy_data_.copy() - ) - symbolic_shape_inference._update_output_from_vi() - if use_node_input: - # if subgraph uses node input, it needs to update to merged dims - subgraph.ClearField("input") - subgraph.input.extend( - symbolic_shape_inference.out_mp_.graph.input[: len(node.input)] - ) - subgraph.ClearField("output") - subgraph.output.extend(symbolic_shape_inference.out_mp_.graph.output) - subgraph.ClearField("value_info") - subgraph.value_info.extend(symbolic_shape_inference.out_mp_.graph.value_info) - subgraph.ClearField("node") - subgraph.node.extend(symbolic_shape_inference.out_mp_.graph.node) - # for new symbolic dims from subgraph output, add to main graph symbolic dims - subgraph_shapes = [ - get_shape_from_value_info(o) - for o in symbolic_shape_inference.out_mp_.graph.output - ] - subgraph_new_symbolic_dims = set( - [ - d - for s in subgraph_shapes - if s - for d in s - if type(d) == str and not d in self.symbolic_dims_ - ] - ) - new_dims = {} - for d in subgraph_new_symbolic_dims: - assert d in symbolic_shape_inference.symbolic_dims_ - new_dims[d] = 
symbolic_shape_inference.symbolic_dims_[d] - self.symbolic_dims_.update(new_dims) - return symbolic_shape_inference - - def _get_int_values(self, node, broadcast=False): - values = [self._try_get_value(node, i) for i in range(len(node.input))] - if all([v is not None for v in values]): - # some shape compute is in floating point, cast to int for sympy - for i, v in enumerate(values): - if type(v) != np.ndarray: - continue - if len(v.shape) > 1: - new_v = None # ignore value for rank > 1 - elif len(v.shape) == 0: - new_v = int(v.item()) - else: - assert len(v.shape) == 1 - new_v = [int(vv) for vv in v] - values[i] = new_v - values_len = [len(v) if type(v) == list else 0 for v in values] - max_len = max(values_len) - if max_len >= 1 and broadcast: - # broadcast - for i, v in enumerate(values): - if v is None: - continue # don't broadcast if value is unknown - if type(v) == list: - if len(v) < max_len: - values[i] = v * max_len - else: - assert len(v) == max_len - else: - values[i] = [v] * max_len - return values - - def _compute_on_sympy_data(self, node, op_func): - assert len(node.output) == 1 - values = self._get_int_values(node, broadcast=True) - if all([v is not None for v in values]): - is_list = [type(v) == list for v in values] - as_list = any(is_list) - if as_list: - self.sympy_data_[node.output[0]] = [op_func(vs) for vs in zip(*values)] - else: - self.sympy_data_[node.output[0]] = op_func(values) - - def _pass_on_sympy_data(self, node): - assert len(node.input) == 1 or node.op_type in [ - "Reshape", - "Unsqueeze", - "Squeeze", - ] - self._compute_on_sympy_data(node, lambda x: x[0]) - - def _pass_on_shape_and_type(self, node): - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - self._get_shape(node, 0), - ) - ) - - def _new_symbolic_dim(self, prefix, dim): - new_dim = "{}_d{}".format(prefix, dim) - if new_dim in self.suggested_merge_: - v = self.suggested_merge_[new_dim] - new_symbolic_dim = sympy.Integer(int(v)) if is_literal(v) else v - else: - new_symbolic_dim = sympy.Symbol(new_dim, integer=True, nonnegative=True) - self.symbolic_dims_[new_dim] = new_symbolic_dim - return new_symbolic_dim - - def _new_symbolic_dim_from_output(self, node, out_idx=0, dim=0): - return self._new_symbolic_dim( - "{}{}_{}_o{}_".format( - node.op_type, - self.prefix_, - list(self.out_mp_.graph.node).index(node), - out_idx, - ), - dim, - ) - - def _new_symbolic_shape(self, rank, node, out_idx=0): - return [ - self._new_symbolic_dim_from_output(node, out_idx, i) for i in range(rank) - ] - - def _compute_conv_pool_shape(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - if len(node.input) > 1: - W_shape = self._get_sympy_shape(node, 1) - rank = len(W_shape) - 2 # number of spatial axes - kernel_shape = W_shape[-rank:] - sympy_shape[1] = W_shape[0] - else: - W_shape = None - kernel_shape = get_attribute(node, "kernel_shape") - rank = len(kernel_shape) - - assert len(sympy_shape) == rank + 2 - - # only need to symbolic shape inference if input has symbolic dims in spatial axes - is_symbolic_dims = [not is_literal(i) for i in sympy_shape[-rank:]] - - if not any(is_symbolic_dims): - shape = get_shape_from_value_info(self.known_vi_[node.output[0]]) - if len(shape) > 0: - assert len(sympy_shape) == len(shape) - sympy_shape[-rank:] = [sympy.Integer(d) for d in shape[-rank:]] - return sympy_shape - - dilations = get_attribute(node, "dilations", [1] * rank) - strides = get_attribute(node, 
"strides", [1] * rank) - effective_kernel_shape = [ - (k - 1) * d + 1 for k, d in zip(kernel_shape, dilations) - ] - pads = get_attribute(node, "pads") - if pads is None: - pads = [0] * (2 * rank) - auto_pad = get_attribute(node, "auto_pad", b"NOTSET").decode("utf-8") - if auto_pad != "VALID" and auto_pad != "NOTSET": - try: - residual = [ - sympy.Mod(d, s) for d, s in zip(sympy_shape[-rank:], strides) - ] - total_pads = [ - max(0, (k - s) if r == 0 else (k - r)) - for k, s, r in zip(effective_kernel_shape, strides, residual) - ] - except TypeError: # sympy may throw TypeError: cannot determine truth value of Relational - total_pads = [ - max(0, (k - s)) for k, s in zip(effective_kernel_shape, strides) - ] # assuming no residual if sympy throws error - elif auto_pad == "VALID": - total_pads = [] - else: - total_pads = [0] * rank - else: - assert len(pads) == 2 * rank - total_pads = [p1 + p2 for p1, p2 in zip(pads[:rank], pads[rank:])] - - ceil_mode = get_attribute(node, "ceil_mode", 0) - for i in range(rank): - effective_input_size = sympy_shape[-rank + i] - if len(total_pads) > 0: - effective_input_size = effective_input_size + total_pads[i] - if ceil_mode: - strided_kernel_positions = sympy.ceiling( - (effective_input_size - effective_kernel_shape[i]) / strides[i] - ) - else: - strided_kernel_positions = ( - effective_input_size - effective_kernel_shape[i] - ) // strides[i] - sympy_shape[-rank + i] = strided_kernel_positions + 1 - return sympy_shape - - def _check_merged_dims(self, dims, allow_broadcast=True): - if allow_broadcast: - dims = [d for d in dims if not (is_literal(d) and int(d) <= 1)] - if not all([d == dims[0] for d in dims]): - self._add_suggested_merge(dims, apply=True) - - def _compute_matmul_shape(self, node, output_dtype=None): - lhs_shape = self._get_shape(node, 0) - rhs_shape = self._get_shape(node, 1) - lhs_rank = len(lhs_shape) - rhs_rank = len(rhs_shape) - lhs_reduce_dim = 0 - rhs_reduce_dim = 0 - assert lhs_rank > 0 and rhs_rank > 0 - if lhs_rank == 1 and rhs_rank == 1: - new_shape = [] - elif lhs_rank == 1: - rhs_reduce_dim = -2 - new_shape = rhs_shape[:rhs_reduce_dim] + [rhs_shape[-1]] - elif rhs_rank == 1: - lhs_reduce_dim = -1 - new_shape = lhs_shape[:lhs_reduce_dim] - else: - lhs_reduce_dim = -1 - rhs_reduce_dim = -2 - new_shape = ( - self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]) - + [lhs_shape[-2]] - + [rhs_shape[-1]] - ) - # merge reduce dim - self._check_merged_dims( - [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]], - allow_broadcast=False, - ) - if output_dtype is None: - # infer output_dtype from input type when not specified - output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info(node.output[0], output_dtype, new_shape) - ) - - def _fuse_tensor_type(self, node, out_idx, dst_type, src_type): - """ - update dst_tensor_type to be compatible with src_tensor_type when dimension mismatches - """ - dst_tensor_type = ( - dst_type.sequence_type.elem_type.tensor_type - if is_sequence(dst_type) - else dst_type.tensor_type - ) - src_tensor_type = ( - src_type.sequence_type.elem_type.tensor_type - if is_sequence(src_type) - else src_type.tensor_type - ) - if dst_tensor_type.elem_type != src_tensor_type.elem_type: - node_id = node.name if node.name else node.op_type - raise ValueError( - f"For node {node_id}, dst_tensor_type.elem_type != src_tensor_type.elem_type: " - f"{onnx.onnx_pb.TensorProto.DataType.Name(dst_tensor_type.elem_type)} vs " - 
f"{onnx.onnx_pb.TensorProto.DataType.Name(src_tensor_type.elem_type)}" - ) - if dst_tensor_type.HasField("shape"): - for di, ds in enumerate( - zip(dst_tensor_type.shape.dim, src_tensor_type.shape.dim) - ): - if ds[0] != ds[1]: - # create a new symbolic dimension for node/out_idx/mismatch dim id in dst_tensor_type for tensor_type - # for sequence_type, clear the dimension - new_dim = onnx.TensorShapeProto.Dimension() - if not is_sequence(dst_type): - new_dim.dim_param = str( - self._new_symbolic_dim_from_output(node, out_idx, di) - ) - dst_tensor_type.shape.dim[di].CopyFrom(new_dim) - else: - dst_tensor_type.CopyFrom(src_tensor_type) - - def _infer_ArrayFeatureExtractor(self, node): - data_shape = self._get_shape(node, 0) - indices_shape = self._get_shape(node, 1) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - data_shape[:-1] + indices_shape, - ) - ) - - def _infer_symbolic_compute_ops(self, node): - funcs = { - "Add": lambda l: l[0] + l[1], - "Div": lambda l: l[0] // l[1], # integer div in sympy - "Equal": lambda l: l[0] == l[1], - "Floor": lambda l: sympy.floor(l[0]), - "Max": lambda l: l[1] - if is_literal(l[0]) and int(l[0]) < -self.int_max_ - else ( - l[0] - if is_literal(l[1]) and int(l[1]) < -self.int_max_ - else sympy.Max(l[0], l[1]) - ), - "Min": lambda l: l[1] - if is_literal(l[0]) and int(l[0]) > self.int_max_ - else ( - l[0] - if is_literal(l[1]) and int(l[1]) > self.int_max_ - else sympy.Min(l[0], l[1]) - ), - "Mul": lambda l: l[0] * l[1], - "Sub": lambda l: l[0] - l[1], - "Where": lambda l: l[1] if l[0] else l[2], - "Neg": lambda l: -l[0], - } - assert node.op_type in funcs - self._compute_on_sympy_data(node, funcs[node.op_type]) - - def _infer_Cast(self, node): - self._pass_on_sympy_data(node) - - def _infer_CategoryMapper(self, node): - input_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type - if input_type == onnx.TensorProto.STRING: - output_type = onnx.TensorProto.INT64 - else: - output_type = onnx.TensorProto.STRING - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], output_type, self._get_shape(node, 0) - ) - ) - - def _infer_Compress(self, node): - input_shape = self._get_shape(node, 0) - # create a new symbolic dimension for Compress output - compress_len = str(self._new_symbolic_dim_from_output(node)) - axis = get_attribute(node, "axis") - if axis == None: - # when axis is not specified, input is flattened before compress so output is 1D - output_shape = [compress_len] - else: - output_shape = input_shape - output_shape[handle_negative_axis(axis, len(input_shape))] = compress_len - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - - def _infer_Concat(self, node): - if any([i in self.sympy_data_ or i in self.initializers_ for i in node.input]): - values = self._get_int_values(node) - if all([v is not None for v in values]): - assert 0 == get_attribute(node, "axis") - self.sympy_data_[node.output[0]] = [] - for i in range(len(node.input)): - value = values[i] - if type(value) == list: - self.sympy_data_[node.output[0]].extend(value) - else: - self.sympy_data_[node.output[0]].append(value) - - sympy_shape = self._get_sympy_shape(node, 0) - axis = handle_negative_axis(get_attribute(node, "axis"), len(sympy_shape)) - for i_idx in range(1, 
len(node.input)): - input_shape = self._get_sympy_shape(node, i_idx) - if input_shape: - sympy_shape[axis] = sympy_shape[axis] + input_shape[axis] - self._update_computed_dims(sympy_shape) - # merge symbolic dims for non-concat axes - for d in range(len(sympy_shape)): - if d == axis: - continue - dims = [ - self._get_shape(node, i_idx)[d] - for i_idx in range(len(node.input)) - if self._get_shape(node, i_idx) - ] - if all([d == dims[0] for d in dims]): - continue - merged = self._merge_symbols(dims) - if type(merged) == str: - sympy_shape[d] = self.symbolic_dims_[merged] if merged else None - else: - sympy_shape[d] = merged - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_ConcatFromSequence(self, node): - seq_shape = self._get_shape(node, 0) - new_axis = 1 if get_attribute(node, "new_axis") else 0 - axis = handle_negative_axis( - get_attribute(node, "axis"), len(seq_shape) + new_axis - ) - concat_dim = str(self._new_symbolic_dim_from_output(node, 0, axis)) - new_shape = seq_shape - if new_axis: - new_shape = seq_shape[:axis] + [concat_dim] + seq_shape[axis:] - else: - new_shape[axis] = concat_dim - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[ - node.input[0] - ].type.sequence_type.elem_type.tensor_type.elem_type, - new_shape, - ) - ) - - def _infer_Constant(self, node): - t = get_attribute(node, "value") - self.sympy_data_[node.output[0]] = numpy_helper.to_array(t) - - def _infer_ConstantOfShape(self, node): - sympy_shape = self._get_int_values(node)[0] - vi = self.known_vi_[node.output[0]] - if sympy_shape is not None: - if type(sympy_shape) != list: - sympy_shape = [sympy_shape] - self._update_computed_dims(sympy_shape) - # update sympy data if output type is int, and shape is known - if vi.type.tensor_type.elem_type == onnx.TensorProto.INT64 and all( - [is_literal(x) for x in sympy_shape] - ): - self.sympy_data_[node.output[0]] = np.ones( - [int(x) for x in sympy_shape], dtype=np.int64 - ) * numpy_helper.to_array(get_attribute(node, "value", 0)) - else: - # create new dynamic shape - # note input0 is a 1D vector of shape, the new symbolic shape has the rank of the shape vector length - sympy_shape = self._new_symbolic_shape(self._get_shape(node, 0)[0], node) - - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_Conv(self, node): - sympy_shape = self._compute_conv_pool_shape(node) - self._update_computed_dims(sympy_shape) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_Einsum(self, node): - # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275 - equation = get_attribute(node, "equation") - equation = equation.replace(b" ", b"") - mid_index = equation.find(b"->") - left_equation = equation[:mid_index] if mid_index != -1 else equation - - num_operands = 0 - num_ellipsis = 0 - num_ellipsis_indices = 0 - - letter_to_dim = {} - - terms = left_equation.split(b",") - for term in terms: - ellipsis_index = term.find(b"...") - shape = self._get_shape(node, num_operands) - rank = len(shape) - if ellipsis_index != -1: - if 
num_ellipsis == 0: - num_ellipsis_indices = rank - len(term) + 3 - num_ellipsis = num_ellipsis + 1 - for i in range(1, rank + 1): - letter = term[-i] - if letter != 46: # letter != b'.' - dim = shape[-i] - if letter not in letter_to_dim.keys(): - letter_to_dim[letter] = dim - elif type(dim) != sympy.Symbol: - letter_to_dim[letter] = dim - num_operands = num_operands + 1 - - new_sympy_shape = [] - from collections import OrderedDict - - num_letter_occurrences = OrderedDict() - if mid_index != -1: - right_equation = equation[mid_index + 2 :] - right_ellipsis_index = right_equation.find(b"...") - if right_ellipsis_index != -1: - for i in range(num_ellipsis_indices): - new_sympy_shape.append(shape[i]) - for c in right_equation: - if c != 46: # c != b'.' - new_sympy_shape.append(letter_to_dim[c]) - else: - for i in range(num_ellipsis_indices): - new_sympy_shape.append(shape[i]) - for c in left_equation: - if c != 44 and c != 46: # c != b',' and c != b'.': - if c in num_letter_occurrences: - num_letter_occurrences[c] = num_letter_occurrences[c] + 1 - else: - num_letter_occurrences[c] = 1 - for key, value in num_letter_occurrences.items(): - if value == 1: - new_sympy_shape.append(letter_to_dim[key]) - - output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info(node.output[0], output_dtype, new_sympy_shape) - ) - - def _infer_Expand(self, node): - expand_to_shape = as_list(self._try_get_value(node, 1), keep_none=True) - if expand_to_shape is not None: - # new_shape's dim can come from shape value - self._update_computed_dims(expand_to_shape) - shape = self._get_shape(node, 0) - new_shape = self._broadcast_shapes( - shape, get_shape_from_sympy_shape(expand_to_shape) - ) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - new_shape, - ) - ) - - def _infer_Gather(self, node): - data_shape = self._get_shape(node, 0) - axis = handle_negative_axis(get_attribute(node, "axis", 0), len(data_shape)) - indices_shape = self._get_shape(node, 1) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - data_shape[:axis] + indices_shape + data_shape[axis + 1 :], - ) - ) - # for 1D input, do some sympy compute - if ( - node.input[0] in self.sympy_data_ - and len(data_shape) == 1 - and 0 == get_attribute(node, "axis", 0) - ): - idx = self._try_get_value(node, 1) - if idx is not None: - data = self.sympy_data_[node.input[0]] - if type(data) == list: - if type(idx) == np.ndarray and len(idx.shape) == 1: - self.sympy_data_[node.output[0]] = [data[int(i)] for i in idx] - else: - self.sympy_data_[node.output[0]] = data[int(idx)] - else: - assert idx == 0 or idx == -1 - self.sympy_data_[node.output[0]] = data - - def _infer_GatherElements(self, node): - indices_shape = self._get_shape(node, 1) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - indices_shape, - ) - ) - - def _infer_GatherND(self, node): - data_shape = self._get_shape(node, 0) - data_rank = len(data_shape) - indices_shape = self._get_shape(node, 1) - indices_rank = len(indices_shape) - last_index_dimension = indices_shape[-1] - assert is_literal(last_index_dimension) and last_index_dimension <= data_rank - new_shape = 
indices_shape[:-1] + data_shape[last_index_dimension:] - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - new_shape, - ) - ) - - def _infer_If(self, node): - # special case for constant condition, in case there are mismatching shape from the non-executed branch - subgraphs = [ - get_attribute(node, "then_branch"), - get_attribute(node, "else_branch"), - ] - cond = self._try_get_value(node, 0) - if cond is not None: - if as_scalar(cond) > 0: - subgraphs[1].CopyFrom(subgraphs[0]) - else: - subgraphs[0].CopyFrom(subgraphs[1]) - - for i_sub, subgraph in enumerate(subgraphs): - subgraph_infer = self._onnx_infer_subgraph( - node, subgraph, use_node_input=False - ) - for i_out in range(len(node.output)): - vi = self.known_vi_[node.output[i_out]] - if i_sub == 0: - vi.CopyFrom(subgraph.output[i_out]) - vi.name = node.output[i_out] - else: - self._fuse_tensor_type( - node, i_out, vi.type, subgraph.output[i_out].type - ) - - # pass on sympy data from subgraph, if cond is constant - if cond is not None and i_sub == (0 if as_scalar(cond) > 0 else 1): - if subgraph.output[i_out].name in subgraph_infer.sympy_data_: - self.sympy_data_[vi.name] = subgraph_infer.sympy_data_[ - subgraph.output[i_out].name - ] - - def _infer_Loop(self, node): - subgraph = get_attribute(node, "body") - assert len(subgraph.input) == len(node.input) - num_loop_carried = ( - len(node.input) - 2 - ) # minus the length and initial loop condition - # when sequence_type is used as loop carried input - # needs to run subgraph infer twice if the tensor shape in sequence contains None - for i, si in enumerate(subgraph.input): - si_name = si.name - si.CopyFrom(self.known_vi_[node.input[i]]) - si.name = si_name - - self._onnx_infer_subgraph(node, subgraph) - - # check subgraph input/output for shape changes in loop carried variables - # for tensor_type, create new symbolic dim when changing, i.e., output = Concat(input, a) - # for sequence_type, propagate from output to input - need_second_infer = False - for i_out in range(1, num_loop_carried + 1): - so = subgraph.output[i_out] - so_shape = get_shape_from_value_info(so) - if is_sequence(so.type): - if so_shape and None in so_shape: - # copy shape from output to input - # note that loop input is [loop_len, cond, input_0, input_1, ...] - # while loop output is [cond, output_0, output_1, ...] 
- subgraph.input[i_out + 1].type.sequence_type.elem_type.CopyFrom( - so.type.sequence_type.elem_type - ) - need_second_infer = True - else: - si = subgraph.input[i_out + 1] - si_shape = get_shape_from_value_info(si) - for di, dims in enumerate(zip(si_shape, so_shape)): - if dims[0] != dims[1]: - new_dim = onnx.TensorShapeProto.Dimension() - new_dim.dim_param = str( - self._new_symbolic_dim_from_output(node, i_out, di) - ) - si.type.tensor_type.shape.dim[di].CopyFrom(new_dim) - so.type.tensor_type.shape.dim[di].CopyFrom(new_dim) - need_second_infer = True - - if need_second_infer: - if self.verbose_ > 2: - logger.debug( - "Rerun Loop: {}({}...), because of sequence in loop carried variables".format( - node.name, node.output[0] - ) - ) - self._onnx_infer_subgraph(node, subgraph, inc_subgraph_id=False) - - # create a new symbolic dimension for iteration dependent dimension - loop_iter_dim = str(self._new_symbolic_dim_from_output(node)) - for i in range(len(node.output)): - vi = self.known_vi_[node.output[i]] - vi.CopyFrom( - subgraph.output[i + 1] - ) # first subgraph output is condition, not in node output - if i >= num_loop_carried: - assert not is_sequence( - vi.type - ) # TODO: handle loop accumulation in sequence_type - subgraph_vi_dim = subgraph.output[i + 1].type.tensor_type.shape.dim - vi.type.tensor_type.shape.ClearField("dim") - vi_dim = vi.type.tensor_type.shape.dim - vi_dim.add().dim_param = loop_iter_dim - vi_dim.extend(list(subgraph_vi_dim)) - vi.name = node.output[i] - - def _infer_MatMul(self, node): - self._compute_matmul_shape(node) - - def _infer_MatMulInteger(self, node): - self._compute_matmul_shape(node, onnx.TensorProto.INT32) - - def _infer_NonMaxSuppression(self, node): - selected = str(self._new_symbolic_dim_from_output(node)) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], onnx.TensorProto.INT64, [selected, 3] - ) - ) - - def _infer_NonZero(self, node): - input_rank = self._get_shape_rank(node, 0) - # create a new symbolic dimension for NonZero output - nz_len = str(self._new_symbolic_dim_from_output(node, 0, 1)) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], vi.type.tensor_type.elem_type, [input_rank, nz_len] - ) - ) - - def _infer_OneHot(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - depth = self._try_get_value(node, 1) - axis = get_attribute(node, "axis", -1) - axis = handle_negative_axis(axis, len(sympy_shape) + 1) - new_shape = get_shape_from_sympy_shape( - sympy_shape[:axis] - + [ - self._new_symbolic_dim_from_output(node) - if not is_literal(depth) - else depth - ] - + sympy_shape[axis:] - ) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[2]].type.tensor_type.elem_type, - new_shape, - ) - ) - - def _infer_Pad(self, node): - if get_opset(self.out_mp_) <= 10: - pads = get_attribute(node, "pads") - else: - pads = self._try_get_value(node, 1) - - sympy_shape = self._get_sympy_shape(node, 0) - rank = len(sympy_shape) - - if pads is not None: - assert len(pads) == 2 * rank - new_sympy_shape = [ - d + pad_up + pad_down - for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:]) - ] - self._update_computed_dims(new_sympy_shape) - else: - # dynamic pads, create new symbolic dimensions - new_sympy_shape = self._new_symbolic_shape(rank, node) - output_tp = self.known_vi_[node.input[0]].type.tensor_type.elem_type - - vi = 
self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape) - ) - ) - - def _infer_Pool(self, node): - sympy_shape = self._compute_conv_pool_shape(node) - self._update_computed_dims(sympy_shape) - for o in node.output: - if not o: - continue - vi = self.known_vi_[o] - vi.CopyFrom( - helper.make_tensor_value_info( - o, - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_aten_bitwise_or(self, node): - shape0 = self._get_shape(node, 0) - shape1 = self._get_shape(node, 1) - new_shape = self._broadcast_shapes(shape0, shape1) - t0 = self.known_vi_[node.input[0]] - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], t0.type.tensor_type.elem_type, new_shape - ) - ) - - def _infer_aten_diagonal(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - rank = len(sympy_shape) - offset = self._try_get_value(node, 1) - dim1 = self._try_get_value(node, 2) - dim2 = self._try_get_value(node, 3) - - assert offset is not None and dim1 is not None and dim2 is not None - dim1 = handle_negative_axis(dim1, rank) - dim2 = handle_negative_axis(dim2, rank) - - new_shape = [] - for dim, val in enumerate(sympy_shape): - if dim not in [dim1, dim2]: - new_shape.append(val) - - shape1 = sympy_shape[dim1] - shape2 = sympy_shape[dim2] - if offset >= 0: - diag_shape = sympy.Max(0, sympy.Min(shape1, shape2 - offset)) - else: - diag_shape = sympy.Max(0, sympy.Min(shape1 + offset, shape2)) - new_shape.append(diag_shape) - - if node.output[0]: - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_shape), - ) - ) - - def _infer_aten_multinomial(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - rank = len(sympy_shape) - assert rank in [1, 2] - num_samples = self._try_get_value(node, 1) - di = rank - 1 - last_dim = ( - num_samples - if num_samples - else str(self._new_symbolic_dim_from_output(node, 0, di)) - ) - output_shape = sympy_shape[:-1] + [last_dim] - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - onnx.TensorProto.INT64, - get_shape_from_sympy_shape(output_shape), - ) - ) - - def _infer_aten_pool2d(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - assert len(sympy_shape) == 4 - sympy_shape[-2:] = [ - self._new_symbolic_dim_from_output(node, 0, i) for i in [2, 3] - ] - self._update_computed_dims(sympy_shape) - for i, o in enumerate(node.output): - if not o: - continue - vi = self.known_vi_[o] - elem_type = ( - onnx.TensorProto.INT64 - if i == 1 - else self.known_vi_[node.input[0]].type.tensor_type.elem_type - ) - vi.CopyFrom( - helper.make_tensor_value_info( - o, elem_type, get_shape_from_sympy_shape(sympy_shape) - ) - ) - - def _infer_aten_minmax(self, node): - vi = self.known_vi_[node.output[0]] - if len(node.input) == 1: - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - [], - ) - ) - else: - assert len(node.input) == 3 - keepdim = self._try_get_value(node, 2) - assert keepdim is not None # can only handle known keepdim case. 
- dim = self._try_get_value(node, 1) - if dim is None: - rank = self._get_shape_rank(node, 0) - output_shape = self._new_symbolic_shape( - rank if keepdim else rank - 1, node - ) - else: - shape = self._get_sympy_shape(node, 0) - dim = handle_negative_axis(dim, len(shape)) - output_shape = shape[:dim] - if keepdim: - output_shape += [1] - output_shape += shape[dim + 1 :] - - output_shape = get_shape_from_sympy_shape(output_shape) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - vi1 = self.known_vi_[node.output[1]] - vi1.CopyFrom( - helper.make_tensor_value_info( - node.output[1], onnx.TensorProto.INT64, output_shape - ) - ) - - def _infer_aten_unfold(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - dimension = self._try_get_value(node, 1) - size = self._try_get_value(node, 2) - step = self._try_get_value(node, 3) - if dimension is not None and size is not None and step is not None: - assert dimension < len(sympy_shape) - sympy_shape[dimension] = (sympy_shape[dimension] - size) // step + 1 - sympy_shape.append(size) - else: - rank = len(sympy_shape) - sympy_shape = self._new_symbolic_shape(rank + 1, node) - self._update_computed_dims(sympy_shape) - if node.output[0]: - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(sympy_shape), - ) - ) - - def _infer_aten_argmax(self, node): - new_shape = None - if node.input[1] == "": - # The argmax of the flattened input is returned. - new_shape = [] - else: - dim = self._try_get_value(node, 1) - keepdim = self._try_get_value(node, 2) - if keepdim is not None: - sympy_shape = self._get_sympy_shape(node, 0) - if dim is not None: - dim = handle_negative_axis(dim, len(sympy_shape)) - if keepdim: - sympy_shape[dim] = 1 - else: - del sympy_shape[dim] - else: - rank = len(sympy_shape) - sympy_shape = self._new_symbolic_shape( - rank if keepdim else rank - 1, node - ) - self._update_computed_dims(sympy_shape) - new_shape = get_shape_from_sympy_shape(sympy_shape) - if node.output[0] and new_shape is not None: - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], onnx.TensorProto.INT64, new_shape - ) - ) - - def _infer_BatchNormalization(self, node): - self._propagate_shape_and_type(node) - - # this works for opsets < 14 and 14 since we check i < len(node.output) in the loop - for i in [1, 2, 3, 4]: - if i < len(node.output) and node.output[i] != "": - # all of these parameters have the same shape as the 1st input - self._propagate_shape_and_type(node, input_index=1, output_index=i) - - def _infer_Range(self, node): - vi = self.known_vi_[node.output[0]] - input_data = self._get_int_values(node) - if all([i is not None for i in input_data]): - start = as_scalar(input_data[0]) - limit = as_scalar(input_data[1]) - delta = as_scalar(input_data[2]) - new_sympy_shape = [sympy.Max(sympy.ceiling((limit - start) / delta), 0)] - else: - new_sympy_shape = [self._new_symbolic_dim_from_output(node)] - self._update_computed_dims(new_sympy_shape) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - - def _infer_ReduceSum(self, node): - keep_dims = get_attribute(node, "keepdims", 1) - if get_opset(self.out_mp_) >= 13 and len(node.input) > 1: - # 
ReduceSum changes axes to input[1] in opset 13 - axes = self._try_get_value(node, 1) - vi = self.known_vi_[node.output[0]] - if axes is None: - assert keep_dims # can only handle keep_dims==True when axes is unknown, by generating new ranks - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape( - self._new_symbolic_shape( - self._get_shape_rank(node, 0), node - ) - ), - ) - ) - else: - shape = self._get_shape(node, 0) - output_shape = [] - axes = [handle_negative_axis(a, len(shape)) for a in axes] - for i, d in enumerate(shape): - if i in axes: - if keep_dims: - output_shape.append(1) - else: - output_shape.append(d) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - - def _infer_ReduceProd(self, node): - axes = get_attribute(node, "axes") - keep_dims = get_attribute(node, "keepdims", 1) - if keep_dims == 0 and axes == [0]: - data = self._get_int_values(node)[0] - if data is not None: - self.sympy_data_[node.output[0]] = sympy_reduce_product(data) - - def _infer_Reshape(self, node): - shape_value = self._try_get_value(node, 1) - vi = self.known_vi_[node.output[0]] - if shape_value is None: - shape_shape = self._get_shape(node, 1) - assert len(shape_shape) == 1 - shape_rank = shape_shape[0] - assert is_literal(shape_rank) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape( - self._new_symbolic_shape(shape_rank, node) - ), - ) - ) - else: - input_sympy_shape = self._get_sympy_shape(node, 0) - total = int(1) - for d in input_sympy_shape: - total = total * d - new_sympy_shape = [] - deferred_dim_idx = -1 - non_deferred_size = int(1) - for i, d in enumerate(shape_value): - if type(d) == sympy.Symbol: - new_sympy_shape.append(d) - elif d == 0: - new_sympy_shape.append(input_sympy_shape[i]) - non_deferred_size = non_deferred_size * input_sympy_shape[i] - else: - new_sympy_shape.append(d) - if d == -1: - deferred_dim_idx = i - elif d != 0: - non_deferred_size = non_deferred_size * d - - assert new_sympy_shape.count(-1) < 2 - if -1 in new_sympy_shape: - new_dim = total // non_deferred_size - new_sympy_shape[deferred_dim_idx] = new_dim - - self._update_computed_dims(new_sympy_shape) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - - self._pass_on_sympy_data(node) - - def _infer_Resize(self, node): - vi = self.known_vi_[node.output[0]] - input_sympy_shape = self._get_sympy_shape(node, 0) - if get_opset(self.out_mp_) <= 10: - scales = self._try_get_value(node, 1) - if scales is not None: - new_sympy_shape = [ - sympy.simplify(sympy.floor(d * s)) - for d, s in zip(input_sympy_shape, scales) - ] - self._update_computed_dims(new_sympy_shape) - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - else: - roi = self._try_get_value(node, 1) - scales = self._try_get_value(node, 2) - sizes = self._try_get_value(node, 3) - if sizes is not None: - new_sympy_shape = [sympy.simplify(sympy.floor(s)) for s in sizes] - self._update_computed_dims(new_sympy_shape) - elif scales is not None: - rank = len(scales) - if ( - get_attribute(node, "coordinate_transformation_mode") - == "tf_crop_and_resize" - ): - assert 
len(roi) == 2 * rank - roi_start = list(roi)[:rank] - roi_end = list(roi)[rank:] - else: - roi_start = [0] * rank - roi_end = [1] * rank - scales = list(scales) - new_sympy_shape = [ - sympy.simplify(sympy.floor(d * (end - start) * scale)) - for d, start, end, scale in zip( - input_sympy_shape, roi_start, roi_end, scales - ) - ] - self._update_computed_dims(new_sympy_shape) - else: - new_sympy_shape = self._new_symbolic_shape( - self._get_shape_rank(node, 0), node - ) - - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - - def _infer_Scan(self, node): - subgraph = get_attribute(node, "body") - num_scan_inputs = get_attribute(node, "num_scan_inputs") - scan_input_axes = get_attribute(node, "scan_input_axes", [0] * num_scan_inputs) - num_scan_states = len(node.input) - num_scan_inputs - scan_input_axes = [ - handle_negative_axis(ax, self._get_shape_rank(node, i + num_scan_states)) - for i, ax in enumerate(scan_input_axes) - ] - # We may have cases where the subgraph has optional inputs that appear in both subgraph's input and initializer, - # but not in the node's input. In such cases, the input model might be invalid, but let's skip those optional inputs. - assert len(subgraph.input) >= len(node.input) - subgraph_inputs = subgraph.input[: len(node.input)] - for i, si in enumerate(subgraph_inputs): - subgraph_name = si.name - si.CopyFrom(self.known_vi_[node.input[i]]) - if i >= num_scan_states: - scan_input_dim = si.type.tensor_type.shape.dim - scan_input_dim.remove( - scan_input_dim[scan_input_axes[i - num_scan_states]] - ) - si.name = subgraph_name - self._onnx_infer_subgraph(node, subgraph) - num_scan_outputs = len(node.output) - num_scan_states - scan_output_axes = get_attribute( - node, "scan_output_axes", [0] * num_scan_outputs - ) - scan_input_dim = get_shape_from_type_proto(self.known_vi_[node.input[-1]].type)[ - scan_input_axes[-1] - ] - for i, o in enumerate(node.output): - vi = self.known_vi_[o] - if i >= num_scan_states: - shape = get_shape_from_type_proto(subgraph.output[i].type) - new_dim = handle_negative_axis( - scan_output_axes[i - num_scan_states], len(shape) + 1 - ) - shape = shape[:new_dim] + [scan_input_dim] + shape[new_dim:] - vi.CopyFrom( - helper.make_tensor_value_info( - o, subgraph.output[i].type.tensor_type.elem_type, shape - ) - ) - else: - vi.CopyFrom(subgraph.output[i]) - vi.name = o - - def _infer_ScatterElements(self, node): - data_shape = self._get_shape(node, 0) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - data_shape, - ) - ) - - def _infer_SequenceAt(self, node): - # need to create new symbolic dimension if sequence shape has None: - seq_shape = self._get_shape(node, 0) - vi = self.known_vi_[node.output[0]] - if seq_shape is not None: - for di, d in enumerate(seq_shape): - if d is not None: - continue - new_dim = onnx.TensorShapeProto.Dimension() - new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, 0, di)) - vi.type.tensor_type.shape.dim[di].CopyFrom(new_dim) - - def _infer_SequenceInsert(self, node): - # workaround bug in onnx's shape inference - vi_seq = self.known_vi_[node.input[0]] - vi_tensor = self.known_vi_[node.input[1]] - vi_out_seq = self.known_vi_[node.output[0]] - vi_out_seq.CopyFrom(vi_seq) - vi_out_seq.name = node.output[0] - self._fuse_tensor_type(node, 0, vi_out_seq.type, 
vi_tensor.type) - - def _infer_Shape(self, node): - self.sympy_data_[node.output[0]] = self._get_sympy_shape(node, 0) - - def _infer_Size(self, node): - sympy_shape = self._get_sympy_shape(node, 0) - self.sympy_data_[node.output[0]] = sympy_reduce_product(sympy_shape) - self.known_vi_[node.output[0]].CopyFrom( - helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, []) - ) - - def _infer_Slice(self, node): - def less_equal(x, y): - try: - return bool(x <= y) - except TypeError: - pass - try: - return bool(y >= x) - except TypeError: - pass - try: - return bool(-x >= -y) - except TypeError: - pass - try: - return bool(-y <= -x) - except TypeError: - # the last attempt; this may raise TypeError - return bool(y - x >= 0) - - def handle_negative_index(index, bound): - """normalizes a negative index to be in [0, bound)""" - try: - if not less_equal(0, index): - if is_literal(index) and index <= -self.int_max_: - # this case is handled separately - return index - return bound + index - except TypeError: - logger.warning("Cannot determine if {} < 0".format(index)) - return index - - if get_opset(self.out_mp_) <= 9: - axes = get_attribute(node, "axes") - starts = get_attribute(node, "starts") - ends = get_attribute(node, "ends") - if not axes: - axes = list(range(len(starts))) - steps = [1] * len(axes) - else: - starts = as_list(self._try_get_value(node, 1), keep_none=True) - ends = as_list(self._try_get_value(node, 2), keep_none=True) - axes = self._try_get_value(node, 3) - steps = self._try_get_value(node, 4) - if axes is None and not (starts is None and ends is None): - axes = list(range(0, len(starts if starts is not None else ends))) - if steps is None and not (starts is None and ends is None): - steps = [1] * len(starts if starts is not None else ends) - axes = as_list(axes, keep_none=True) - steps = as_list(steps, keep_none=True) - - new_sympy_shape = self._get_sympy_shape(node, 0) - if starts is None or ends is None: - if axes is None: - for i in range(len(new_sympy_shape)): - new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i) - else: - new_sympy_shape = get_shape_from_sympy_shape(new_sympy_shape) - for i in axes: - new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i) - else: - for i, s, e, t in zip(axes, starts, ends, steps): - e = handle_negative_index(e, new_sympy_shape[i]) - if is_literal(e): - if e >= self.int_max_: - e = new_sympy_shape[i] - elif e <= -self.int_max_: - e = 0 if s > 0 else -1 - elif is_literal(new_sympy_shape[i]): - if e < 0: - e = max(0, e + new_sympy_shape[i]) - e = min(e, new_sympy_shape[i]) - else: - if e > 0: - e = ( - sympy.Min(e, new_sympy_shape[i]) if e > 1 else e - ) # special case for slicing first to make computation easier - else: - if is_literal(new_sympy_shape[i]): - e = sympy.Min(e, new_sympy_shape[i]) - else: - try: - if not less_equal(e, new_sympy_shape[i]): - e = new_sympy_shape[i] - except Exception: - logger.warning( - "Unable to determine if {} <= {}, treat as equal".format( - e, new_sympy_shape[i] - ) - ) - e = new_sympy_shape[i] - - s = handle_negative_index(s, new_sympy_shape[i]) - if is_literal(new_sympy_shape[i]) and is_literal(s): - s = max(0, min(s, new_sympy_shape[i])) - - new_sympy_shape[i] = sympy.simplify( - (e - s + t + (-1 if t > 0 else 1)) // t - ) - - self._update_computed_dims(new_sympy_shape) - - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) 
- - # handle sympy_data if needed, for slice in shape computation - if ( - node.input[0] in self.sympy_data_ - and [0] == axes - and len(starts) == 1 - and len(ends) == 1 - and len(steps) == 1 - ): - input_sympy_data = self.sympy_data_[node.input[0]] - if type(input_sympy_data) == list or ( - type(input_sympy_data) == np.array and len(input_sympy_data.shape) == 1 - ): - self.sympy_data_[node.output[0]] = input_sympy_data[ - starts[0] : ends[0] : steps[0] - ] - - def _infer_SoftmaxCrossEntropyLoss(self, node): - vi = self.known_vi_[node.output[0]] - elem_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi.type.tensor_type.elem_type = elem_type - vi.type.tensor_type.shape.CopyFrom(onnx.TensorShapeProto()) - - if len(node.output) > 1: - data_shape = self._get_shape(node, 0) - vi = self.known_vi_[node.output[1]] - vi.CopyFrom(helper.make_tensor_value_info(vi.name, elem_type, data_shape)) - - def _infer_Split_Common(self, node, make_value_info_func): - input_sympy_shape = self._get_sympy_shape(node, 0) - axis = handle_negative_axis( - get_attribute(node, "axis", 0), len(input_sympy_shape) - ) - split = get_attribute(node, "split") - if not split: - num_outputs = len(node.output) - split = [input_sympy_shape[axis] / sympy.Integer(num_outputs)] * num_outputs - self._update_computed_dims(split) - else: - split = [sympy.Integer(s) for s in split] - - for i_o in range(len(split)): - vi = self.known_vi_[node.output[i_o]] - vi.CopyFrom( - make_value_info_func( - node.output[i_o], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - get_shape_from_sympy_shape( - input_sympy_shape[:axis] - + [split[i_o]] - + input_sympy_shape[axis + 1 :] - ), - ) - ) - self.known_vi_[vi.name] = vi - - def _infer_Split(self, node): - self._infer_Split_Common(node, helper.make_tensor_value_info) - - def _infer_SplitToSequence(self, node): - self._infer_Split_Common(node, helper.make_sequence_value_info) - - def _infer_Squeeze(self, node): - input_shape = self._get_shape(node, 0) - op_set = get_opset(self.out_mp_) - - # Depending on op-version 'axes' are provided as attribute or via 2nd input - if op_set < 13: - axes = get_attribute(node, "axes") - assert self._try_get_value(node, 1) is None - else: - axes = self._try_get_value(node, 1) - assert get_attribute(node, "axes") is None - - if axes is None: - # No axes have been provided (neither via attribute nor via input). - # In this case the 'Shape' op should remove all axis with dimension 1. - # For symbolic dimensions we guess they are !=1. - output_shape = [s for s in input_shape if s != 1] - if self.verbose_ > 0: - symbolic_dimensions = [s for s in input_shape if type(s) != int] - if len(symbolic_dimensions) > 0: - logger.debug( - f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. " - + f"Assuming the following dimensions are never equal to 1: {symbolic_dimensions}" - ) - else: - axes = [handle_negative_axis(a, len(input_shape)) for a in axes] - output_shape = [] - for i in range(len(input_shape)): - if i not in axes: - output_shape.append(input_shape[i]) - else: - assert input_shape[i] == 1 or type(input_shape[i]) != int - if self.verbose_ > 0 and type(input_shape[i]) != int: - logger.debug( - f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. " - + f"Assuming the dimension '{input_shape[i]}' at index {i} of the input to be equal to 1." 
- ) - - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - self._pass_on_sympy_data(node) - - def _infer_Tile(self, node): - repeats_value = self._try_get_value(node, 1) - new_sympy_shape = [] - if repeats_value is not None: - input_sympy_shape = self._get_sympy_shape(node, 0) - for i, d in enumerate(input_sympy_shape): - new_dim = d * repeats_value[i] - new_sympy_shape.append(new_dim) - self._update_computed_dims(new_sympy_shape) - else: - new_sympy_shape = self._new_symbolic_shape( - self._get_shape_rank(node, 0), node - ) - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_sympy_shape), - ) - ) - - def _infer_TopK(self, node): - rank = self._get_shape_rank(node, 0) - axis = handle_negative_axis(get_attribute(node, "axis", -1), rank) - new_shape = self._get_shape(node, 0) - - if get_opset(self.out_mp_) <= 9: - k = get_attribute(node, "k") - else: - k = self._get_int_values(node)[1] - - if k == None: - k = self._new_symbolic_dim_from_output(node) - else: - k = as_scalar(k) - - if type(k) in [int, str]: - new_shape[axis] = k - else: - new_sympy_shape = self._get_sympy_shape(node, 0) - new_sympy_shape[axis] = k - self._update_computed_dims( - new_sympy_shape - ) # note that TopK dim could be computed in sympy_data, so need to update computed_dims when it enters shape - new_shape = get_shape_from_sympy_shape(new_sympy_shape) - - for i_o in range(len(node.output)): - vi = self.known_vi_[node.output[i_o]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[i_o], vi.type.tensor_type.elem_type, new_shape - ) - ) - - def _infer_Transpose(self, node): - if node.input[0] in self.sympy_data_: - data_shape = self._get_shape(node, 0) - perm = get_attribute(node, "perm", reversed(list(range(len(data_shape))))) - input_data = self.sympy_data_[node.input[0]] - self.sympy_data_[node.output[0]] = ( - np.transpose( - np.array(input_data).reshape(*data_shape), axes=tuple(perm) - ) - .flatten() - .tolist() - ) - - def _infer_Unsqueeze(self, node): - input_shape = self._get_shape(node, 0) - op_set = get_opset(self.out_mp_) - - # Depending on op-version 'axes' are provided as attribute or via 2nd input - if op_set < 13: - axes = get_attribute(node, "axes") - assert self._try_get_value(node, 1) is None - else: - axes = self._try_get_value(node, 1) - assert get_attribute(node, "axes") is None - - output_rank = len(input_shape) + len(axes) - axes = [handle_negative_axis(a, output_rank) for a in axes] - - input_axis = 0 - output_shape = [] - for i in range(output_rank): - if i in axes: - output_shape.append(1) - else: - output_shape.append(input_shape[input_axis]) - input_axis += 1 - - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], - self.known_vi_[node.input[0]].type.tensor_type.elem_type, - output_shape, - ) - ) - - self._pass_on_sympy_data(node) - - def _infer_ZipMap(self, node): - map_key_type = None - if get_attribute(node, "classlabels_int64s") is not None: - map_key_type = onnx.TensorProto.INT64 - elif get_attribute(node, "classlabels_strings") is not None: - map_key_type = onnx.TensorProto.STRING - - assert map_key_type is not None - new_vi = onnx.ValueInfoProto() - new_vi.name = node.output[0] - new_vi.type.sequence_type.elem_type.map_type.value_type.tensor_type.elem_type = ( - 
onnx.TensorProto.FLOAT - ) - new_vi.type.sequence_type.elem_type.map_type.key_type = map_key_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom(new_vi) - - def _infer_Attention(self, node): - shape = self._get_shape(node, 0) - shape_bias = self._get_shape(node, 2) - assert len(shape) == 3 and len(shape_bias) == 1 - qkv_hidden_sizes_attr = get_attribute(node, "qkv_hidden_sizes") - if qkv_hidden_sizes_attr is not None: - assert len(qkv_hidden_sizes_attr) == 3 - shape[2] = int(qkv_hidden_sizes_attr[2]) - else: - shape[2] = int(shape_bias[0] / 3) - output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, shape)) - - if len(node.output) > 1: - # input shape: (batch_size, sequence_length, hidden_size) - # past shape: (2, batch_size, num_heads, past_sequence_length, head_size) - # mask shape: (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length) or (batch_size, 1, max_seq_len, max_seq_len) - # present shape: (2, batch_size, num_heads, total_sequence_length, head_size), where total_sequence_length=sequence_length+past_sequence_length - input_shape = self._get_shape(node, 0) - past_shape = self._get_shape(node, 4) - mask_shape = self._get_shape(node, 3) - if len(past_shape) == 5: - if len(mask_shape) in [2, 3]: - past_shape[3] = mask_shape[-1] - elif isinstance(input_shape[1], int) and isinstance(past_shape[3], int): - past_shape[3] = input_shape[1] + past_shape[3] - else: - past_shape[3] = f"{past_shape[3]}+{input_shape[1]}" - vi = self.known_vi_[node.output[1]] - vi.CopyFrom( - helper.make_tensor_value_info(vi.name, output_dtype, past_shape) - ) - - def _infer_BiasGelu(self, node): - self._propagate_shape_and_type(node) - - def _infer_FastGelu(self, node): - self._propagate_shape_and_type(node) - - def _infer_Gelu(self, node): - self._propagate_shape_and_type(node) - - def _infer_LayerNormalization(self, node): - self._propagate_shape_and_type(node) - - def _infer_LongformerAttention(self, node): - self._propagate_shape_and_type(node) - - def _infer_EmbedLayerNormalization(self, node): - input_ids_shape = self._get_shape(node, 0) - word_embedding_shape = self._get_shape(node, 2) - assert len(input_ids_shape) == 2 and len(word_embedding_shape) == 2 - output_shape = input_ids_shape + [word_embedding_shape[1]] - - word_embedding_dtype = self.known_vi_[node.input[2]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[0], word_embedding_dtype, output_shape - ) - ) - - mask_index_shape = [input_ids_shape[0]] - vi = self.known_vi_[node.output[1]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[1], onnx.TensorProto.INT32, mask_index_shape - ) - ) - - if len(node.output) > 2: - # Optional output of add before layer nomalization is done - # shape is same as the output - vi = self.known_vi_[node.output[2]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[2], word_embedding_dtype, output_shape - ) - ) - - def _infer_SkipLayerNormalization(self, node): - self._propagate_shape_and_type(node) - - def _infer_PythonOp(self, node): - output_tensor_types = get_attribute(node, "output_tensor_types") - assert output_tensor_types - output_tensor_ranks = get_attribute(node, "output_tensor_ranks") - assert output_tensor_ranks - - # set the context output seperately. - # The first output is autograd's context. 
- vi = self.known_vi_[node.output[0]] - vi.CopyFrom( - helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, []) - ) - - # Outputs after autograd's context are tensors. - # We assume their ranks are fixed for different model inputs. - for i in range(len(node.output) - 1): - # Process the i-th tensor outputs. - vi = self.known_vi_[node.output[i + 1]] - sympy_shape = self._new_symbolic_shape(output_tensor_ranks[i], node) - shape = get_shape_from_sympy_shape(sympy_shape) - value_info = helper.make_tensor_value_info( - node.output[i + 1], output_tensor_types[i], shape - ) - vi.CopyFrom(value_info) - - def _propagate_shape_and_type(self, node, input_index=0, output_index=0): - shape = self._get_shape(node, input_index) - output_dtype = self.known_vi_[ - node.input[input_index] - ].type.tensor_type.elem_type - vi = self.known_vi_[node.output[output_index]] - vi.CopyFrom( - helper.make_tensor_value_info( - node.output[output_index], output_dtype, shape - ) - ) - - def _is_none_dim(self, dim_value): - if type(dim_value) != str: - return False - if "unk__" not in dim_value: - return False - if dim_value in self.symbolic_dims_.keys(): - return False - return True - - def _is_shape_contains_none_dim(self, out_shape): - for out in out_shape: - if self._is_none_dim(out): - return out - return None - - def _infer_impl(self, start_sympy_data=None): - self.sympy_data_ = start_sympy_data or {} - self.out_mp_.graph.ClearField("value_info") - self._apply_suggested_merge(graph_input_only=True) - self.input_symbols_ = set() - for i in self.out_mp_.graph.input: - input_shape = get_shape_from_value_info(i) - if input_shape is None: - continue - - if is_sequence(i.type): - input_dims = i.type.sequence_type.elem_type.tensor_type.shape.dim - else: - input_dims = i.type.tensor_type.shape.dim - - for i_dim, dim in enumerate(input_shape): - if dim is None: - # some models use None for symbolic dim in input, replace it with a string - input_dims[i_dim].dim_param = str( - self._new_symbolic_dim(i.name, i_dim) - ) - - self.input_symbols_.update([d for d in input_shape if type(d) == str]) - - for s in self.input_symbols_: - if s in self.suggested_merge_: - s_merge = self.suggested_merge_[s] - assert s_merge in self.symbolic_dims_ - self.symbolic_dims_[s] = self.symbolic_dims_[s_merge] - else: - # Since inputs are not produced by other ops, we can assume positivity - self.symbolic_dims_[s] = sympy.Symbol(s, integer=True, positive=True) - # create a temporary ModelProto for single node inference - # note that we remove initializer to have faster inference - # for tensor ops like Reshape/Tile/Expand that read initializer, we need to do sympy computation based inference anyways - self.tmp_mp_ = onnx.ModelProto() - self.tmp_mp_.CopyFrom(self.out_mp_) - self.tmp_mp_.graph.ClearField("initializer") - - # compute prerequesite for node for topological sort - # node with subgraphs may have dependency on implicit inputs, which will affect topological sort - prereq_for_node = ( - {} - ) # map from node to all its inputs, including implicit ones in subgraph - - def get_prereq(node): - names = set(i for i in node.input if i) - subgraphs = [] - if "If" == node.op_type: - subgraphs = [ - get_attribute(node, "then_branch"), - get_attribute(node, "else_branch"), - ] - elif node.op_type in ["Loop", "Scan"]: - subgraphs = [get_attribute(node, "body")] - for g in subgraphs: - g_outputs_and_initializers = {i.name for i in g.initializer} - g_prereq = set() - for n in g.node: - g_outputs_and_initializers.update(n.output) - for n 
in g.node: - g_prereq.update( - [ - i - for i in get_prereq(n) - if i not in g_outputs_and_initializers - ] - ) - names.update(g_prereq) - # remove subgraph inputs from g_prereq since those are local-only - for i in g.input: - if i.name in names: - names.remove(i.name) - return names - - for n in self.tmp_mp_.graph.node: - prereq_for_node[n.output[0]] = get_prereq(n) - - # topological sort nodes, note there might be dead nodes so we check if all graph outputs are reached to terminate - sorted_nodes = [] - sorted_known_vi = set( - [ - i.name - for i in list(self.out_mp_.graph.input) - + list(self.out_mp_.graph.initializer) - ] - ) - if any([o.name in sorted_known_vi for o in self.out_mp_.graph.output]): - # Loop/Scan will have some graph output in graph inputs, so don't do topological sort - sorted_nodes = self.out_mp_.graph.node - else: - while not all( - [o.name in sorted_known_vi for o in self.out_mp_.graph.output] - ): - old_sorted_nodes_len = len(sorted_nodes) - for node in self.out_mp_.graph.node: - if (node.output[0] not in sorted_known_vi) and all( - [ - i in sorted_known_vi - for i in prereq_for_node[node.output[0]] - if i - ] - ): - sorted_known_vi.update(node.output) - sorted_nodes.append(node) - if old_sorted_nodes_len == len(sorted_nodes) and not all( - [o.name in sorted_known_vi for o in self.out_mp_.graph.output] - ): - raise Exception("Invalid model with cyclic graph") - - for node in sorted_nodes: - assert all([i in self.known_vi_ for i in node.input if i]) - self._onnx_infer_single_node(node) - known_aten_op = False - if node.op_type in self.dispatcher_: - self.dispatcher_[node.op_type](node) - elif node.op_type in ["ConvTranspose"]: - # onnx shape inference ops like ConvTranspose may have empty shape for symbolic input - # before adding symbolic compute for them - # mark the output type as UNDEFINED to allow guessing of rank - vi = self.known_vi_[node.output[0]] - if len(vi.type.tensor_type.shape.dim) == 0: - vi.type.tensor_type.elem_type = onnx.TensorProto.UNDEFINED - elif node.op_type == "ATen" and node.domain == "org.pytorch.aten": - for attr in node.attribute: - # TODO: Is overload_name needed? - if attr.name == "operator": - aten_op_name = ( - attr.s.decode("utf-8") - if isinstance(attr.s, bytes) - else attr.s - ) - if aten_op_name in self.aten_op_dispatcher_: - known_aten_op = True - self.aten_op_dispatcher_[aten_op_name](node) - break - - if self.verbose_ > 2: - logger.debug(node.op_type + ": " + node.name) - for i, name in enumerate(node.input): - logger.debug( - " Input {}: {} {}".format( - i, name, "initializer" if name in self.initializers_ else "" - ) - ) - - # onnx automatically merge dims with value, i.e. 
Mul(['aaa', 'bbb'], [1000, 1]) -> [1000, 'bbb'] - # symbolic shape inference needs to apply merge of 'aaa' -> 1000 in this case - if node.op_type in [ - "Add", - "Sub", - "Mul", - "Div", - "MatMul", - "MatMulInteger", - "MatMulInteger16", - "Where", - "Sum", - ]: - vi = self.known_vi_[node.output[0]] - out_rank = len(get_shape_from_type_proto(vi.type)) - in_shapes = [self._get_shape(node, i) for i in range(len(node.input))] - for d in range( - out_rank - - ( - 2 - if node.op_type - in ["MatMul", "MatMulInteger", "MatMulInteger16"] - else 0 - ) - ): - in_dims = [ - s[len(s) - out_rank + d] - for s in in_shapes - if len(s) + d >= out_rank - ] - if len(in_dims) > 1: - self._check_merged_dims(in_dims, allow_broadcast=True) - - for i_o in range(len(node.output)): - vi = self.known_vi_[node.output[i_o]] - out_type = vi.type - out_type_kind = out_type.WhichOneof("value") - - # do not process shape for non-tensors - if out_type_kind not in ["tensor_type", "sparse_tensor_type", None]: - if self.verbose_ > 2: - if out_type_kind == "sequence_type": - seq_cls_type = out_type.sequence_type.elem_type.WhichOneof( - "value" - ) - if "tensor_type" == seq_cls_type: - logger.debug( - " {}: sequence of {} {}".format( - node.output[i_o], - str(get_shape_from_value_info(vi)), - onnx.TensorProto.DataType.Name( - vi.type.sequence_type.elem_type.tensor_type.elem_type - ), - ) - ) - else: - logger.debug( - " {}: sequence of {}".format( - node.output[i_o], seq_cls_type - ) - ) - else: - logger.debug( - " {}: {}".format(node.output[i_o], out_type_kind) - ) - continue - - out_shape = get_shape_from_value_info(vi) - out_type_undefined = ( - out_type.tensor_type.elem_type == onnx.TensorProto.UNDEFINED - ) - if self.verbose_ > 2: - logger.debug( - " {}: {} {}".format( - node.output[i_o], - str(out_shape), - onnx.TensorProto.DataType.Name( - vi.type.tensor_type.elem_type - ), - ) - ) - if node.output[i_o] in self.sympy_data_: - logger.debug( - " Sympy Data: " + str(self.sympy_data_[node.output[i_o]]) - ) - - # onnx >= 1.11.0, use unk__#index instead of None when the shape dim is uncertain - if ( - out_shape is not None - and ( - None in out_shape or self._is_shape_contains_none_dim(out_shape) - ) - ) or out_type_undefined: - if self.auto_merge_: - if node.op_type in [ - "Add", - "Sub", - "Mul", - "Div", - "MatMul", - "MatMulInteger", - "MatMulInteger16", - "Concat", - "Where", - "Sum", - "Equal", - "Less", - "Greater", - "LessOrEqual", - "GreaterOrEqual", - "Min", - "Max", - ]: - shapes = [ - self._get_shape(node, i) for i in range(len(node.input)) - ] - if node.op_type in [ - "MatMul", - "MatMulInteger", - "MatMulInteger16", - ]: - if ( - None in out_shape - or self._is_shape_contains_none_dim(out_shape) - ): - if None in out_shape: - idx = out_shape.index(None) - else: - idx = out_shape.index( - self._is_shape_contains_none_dim(out_shape) - ) - dim_idx = [ - len(s) - len(out_shape) + idx for s in shapes - ] - # only support auto merge for MatMul for dim < rank-2 when rank > 2 - assert ( - len(shapes[0]) > 2 - and dim_idx[0] < len(shapes[0]) - 2 - ) - assert ( - len(shapes[1]) > 2 - and dim_idx[1] < len(shapes[1]) - 2 - ) - elif node.op_type == "Expand": - # auto merge for cases like Expand([min(batch, 1), min(seq, 512)], [batch, seq]) - shapes = [ - self._get_shape(node, 0), - self._get_value(node, 1), - ] - else: - shapes = [] - - if shapes: - for idx in range(len(out_shape)): - if out_shape[idx] is not None and not self._is_none_dim( - out_shape[idx] - ): - continue - # note that the broadcasting rule aligns from 
right to left - # if a tensor has a lower rank (dim_idx[idx] < 0), it would automatically broadcast and need no merge - dim_idx = [ - len(s) - len(out_shape) + idx for s in shapes - ] - if len(dim_idx) > 0: - self._add_suggested_merge( - [ - s[i] if is_literal(s[i]) else str(s[i]) - for s, i in zip(shapes, dim_idx) - if i >= 0 - ] - ) - self.run_ = True - else: - self.run_ = False - else: - self.run_ = False - - # create new dynamic dims for ops not handled by symbolic shape inference - if ( - self.run_ == False - and not node.op_type in self.dispatcher_ - and not known_aten_op - ): - is_unknown_op = out_type_undefined and ( - out_shape is None or len(out_shape) == 0 - ) - if is_unknown_op: - # unknown op to ONNX, maybe from higher opset or other domain - # only guess the output rank from input 0 when using guess_output_rank option - out_rank = ( - self._get_shape_rank(node, 0) - if self.guess_output_rank_ - else -1 - ) - else: - # valid ONNX op, but not handled by symbolic shape inference, just assign dynamic shape - out_rank = len(out_shape) - - if out_rank >= 0: - new_shape = self._new_symbolic_shape(out_rank, node, i_o) - if out_type_undefined: - # guess output data type from input vi if not defined - out_dtype = self.known_vi_[ - node.input[0] - ].type.tensor_type.elem_type - else: - # otherwise, use original data type - out_dtype = vi.type.tensor_type.elem_type - vi.CopyFrom( - helper.make_tensor_value_info( - vi.name, - out_dtype, - get_shape_from_sympy_shape(new_shape), - ) - ) - - if self.verbose_ > 0: - if is_unknown_op: - logger.debug( - "Possible unknown op: {} node: {}, guessing {} shape".format( - node.op_type, node.name, vi.name - ) - ) - if self.verbose_ > 2: - logger.debug( - " {}: {} {}".format( - node.output[i_o], - str(new_shape), - vi.type.tensor_type.elem_type, - ) - ) - - self.run_ = True - continue # continue the inference after guess, no need to stop as no merge is needed - - if self.verbose_ > 0 or not self.auto_merge_ or out_type_undefined: - logger.debug( - "Stopping at incomplete shape inference at " - + node.op_type - + ": " - + node.name - ) - logger.debug("node inputs:") - for i in node.input: - logger.debug(self.known_vi_[i]) - logger.debug("node outputs:") - for o in node.output: - logger.debug(self.known_vi_[o]) - if self.auto_merge_ and not out_type_undefined: - logger.debug("Merging: " + str(self.suggested_merge_)) - return False - - self.run_ = False - return True - - def _update_output_from_vi(self): - for output in self.out_mp_.graph.output: - if output.name in self.known_vi_: - output.CopyFrom(self.known_vi_[output.name]) - - @staticmethod - def infer_shapes( - in_mp, int_max=2**31 - 1, auto_merge=False, guess_output_rank=False, verbose=0 - ): - onnx_opset = get_opset(in_mp) - if (not onnx_opset) or onnx_opset < 7: - logger.warning("Only support models of onnx opset 7 and above.") - return None - symbolic_shape_inference = SymbolicShapeInference( - int_max, auto_merge, guess_output_rank, verbose - ) - all_shapes_inferred = False - symbolic_shape_inference._preprocess(in_mp) - while symbolic_shape_inference.run_: - all_shapes_inferred = symbolic_shape_inference._infer_impl() - symbolic_shape_inference._update_output_from_vi() - if not all_shapes_inferred: - logger.warning("Incomplete symbolic shape inference") - return symbolic_shape_inference.out_mp_ - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--input", required=True, help="The input model file") - parser.add_argument("--output", help="The output model 
file") - parser.add_argument( - "--auto_merge", - help="Automatically merge symbolic dims when confliction happens", - action="store_true", - default=False, - ) - parser.add_argument( - "--int_max", - help="maximum value for integer to be treated as boundless for ops like slice", - type=int, - default=2**31 - 1, - ) - parser.add_argument( - "--guess_output_rank", - help="guess output rank to be the same as input 0 for unknown ops", - action="store_true", - default=False, - ) - parser.add_argument( - "--verbose", - help="Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed", - type=int, - default=0, - ) - parser.add_argument( - "--save_as_external_data", - help="Saving an ONNX model to external data", - action="store_true", - default=False, - ) - parser.add_argument( - "--all_tensors_to_one_file", - help="Saving all the external data to one file", - action="store_true", - default=False, - ) - parser.add_argument( - "--external_data_location", - help="The file location to save the external file", - default="./", - ) - parser.add_argument( - "--external_data_size_threshold", - help="The size threshold for external data", - type=int, - default=1024, - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_arguments() - logger.info("input model: " + args.input) - if args.output: - logger.info("output model " + args.output) - logger.info("Doing symbolic shape inference...") - out_mp = SymbolicShapeInference.infer_shapes( - onnx.load(args.input), - args.int_max, - args.auto_merge, - args.guess_output_rank, - args.verbose, - ) - if args.output and out_mp: - if args.save_as_external_data: - onnx.save_model( - out_mp, - args.output, - save_as_external_data=True, - all_tensors_to_one_file=args.all_tensors_to_one_file, - location=args.external_data_location, - size_threshold=args.external_data_size_threshold, - convert_attribute=False, - ) - else: - onnx.save(out_mp, args.output) - logger.info("Done!") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt deleted file mode 100644 index b80f9f4022328703df32af16182ea930645a6db6..0000000000000000000000000000000000000000 --- a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# - -onnxsim -packaging -sympy